#map = affine_map<()[s0] -> (4, s0 * 32)>
#map1 = affine_map<()[s0] -> (4, s0)>
#map2 = affine_map<()[s0] -> (s0, 2097152)>
#map3 = affine_map<()[s0] -> (s0 * 32)>
#map4 = affine_map<()[s0] -> (1, 1, s0 * 32, 131072)>
#map5 = affine_map<()[s0] -> (1, 1, s0 * 32, s0 * 32)>
#map6 = affine_map<()[s0] -> (4, 1, s0 * 32)>
#map7 = affine_map<()[s0] -> (4, 1, 1, s0 * 32)>
#map8 = affine_map<()[s0] -> (4, 1, s0 * 32, s0 * 32)>
#map9 = affine_map<()[s0] -> (4, s0 * 32, 4096)>
#map10 = affine_map<()[s0] -> (4, s0 * 32, 1)>
#map11 = affine_map<()[s0] -> (4, s0 * 32, 1024)>
#map12 = affine_map<()[s0] -> (4, s0 * 32, 32, 128)>
#map13 = affine_map<()[s0] -> (4, s0 * 32, 8, 128)>
#map14 = affine_map<()[s0] -> (s0 * 32, 128)>
#map15 = affine_map<()[s0] -> (1, s0 * 32, 128)>
#map16 = affine_map<()[s0] -> (1, s0 * 32, 1, 128)>
#map17 = affine_map<()[s0] -> (4, s0 * 32, 1, 128)>
#map18 = affine_map<()[s0] -> (4, s0 * 32, 32, 64)>
#map19 = affine_map<()[s0] -> (4, s0 * 32, 8, 64)>
#map20 = affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)>
#map21 = affine_map<()[s0] -> (s0 * 64, 32, 8, 128)>
#map22 = affine_map<()[s0] -> (4, s0, 32, 8, 128)>
#map23 = affine_map<()[s0] -> (s0 * 4, 32, 8, 128)>
#map24 = affine_map<()[s0] -> (s0 * 4)>
#map25 = affine_map<()[s0] -> (4, s0 * 32, 8, 1, 128)>
#map26 = affine_map<()[s0] -> (4, s0 * 32, 8, 4, 128)>
#map27 = affine_map<()[s0] -> (4, 32, s0 * 32, 128)>
#map28 = affine_map<()[s0] -> (4, s0 * 32, 14336)>
#map29 = affine_map<()[s0] -> (s0 * 128, 4096)>
#map30 = affine_map<()[s0] -> (s0 * 128, 128256)>
#map31 = affine_map<()[s0] -> (4, s0 * 32, 128256)>
#map32 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>
#map33 = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3)>
#map34 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
#map35 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d3)>
#map36 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d4, d3)>
#map37 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d4, d2)>
#map38 = affine_map<(d0, d1, d2, d3, d4) -> ()>
#map39 = affine_map<(d0, d1, d2, d3, d4) -> (d1, d4)>
#map40 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)>
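// The shape maps above (#map .. #map31) are parameterized by a single symbol
// s0, which appears to be a dynamic sequence-block count: s0 * 32 recurs as a
// token count, suggesting a KV cache paged in 32-token blocks (an assumption
// inferred from the shapes, not stated in the IR). The recurring constants
// line up with the parameter shapes declared below: 4 (batch), 4096 (hidden),
// 1024 (K/V projection), 32 and 8 (query and KV head counts), 128 (head dim),
// 14336 (FFN), 128256 (vocab). #map32 .. #map40 are linalg-style indexing
// maps over iteration dimensions (d0, d1, ...) rather than shape maps.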
"builtin.module"() <{sym_name = "module"}> ({
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"token_embd.weight"> : tensor<128256x4096xbf16>, sym_name = "__auto.token_embd.weight", sym_visibility = "private", type = tensor<128256x4096xbf16>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.0.attn_norm.weight"> : tensor<4096xbf16>, sym_name = "__auto.blk.0.attn_norm.weight", sym_visibility = "private", type = tensor<4096xbf16>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.0.attn_q.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.0.attn_q.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.0.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.0.attn_q.weight:qs", sym_visibility = "private", type = tensor<4096x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.0.attn_q.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.0.attn_q.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.0.attn_k.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.0.attn_k.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.0.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.0.attn_k.weight:qs", sym_visibility = "private", type = tensor<1024x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.0.attn_k.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.0.attn_k.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.0.attn_v.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.0.attn_v.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.0.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.0.attn_v.weight:qs", sym_visibility = "private", type = tensor<1024x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.0.attn_v.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.0.attn_v.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.0.attn_scale"> : tensor<f32>, sym_name = "__auto.blk.0.attn_scale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.0.attn_output.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.0.attn_output.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.0.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.0.attn_output.weight:qs", sym_visibility = "private", type = tensor<4096x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.0.ffn_norm.weight"> : tensor<4096xbf16>, sym_name = "__auto.blk.0.ffn_norm.weight", sym_visibility = "private", type = tensor<4096xbf16>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.0.ffn_gate.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.0.ffn_gate.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.0.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.0.ffn_gate.weight:qs", sym_visibility = "private", type = tensor<14336x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.0.ffn_up.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.0.ffn_up.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.0.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.0.ffn_up.weight:qs", sym_visibility = "private", type = tensor<14336x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.0.ffn_down.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.0.ffn_down.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.0.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>, sym_name = "__auto.blk.0.ffn_down.weight:qs", sym_visibility = "private", type = tensor<4096x14336xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.1.attn_norm.weight"> : tensor<4096xbf16>, sym_name = "__auto.blk.1.attn_norm.weight", sym_visibility = "private", type = tensor<4096xbf16>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.1.attn_q.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.1.attn_q.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.1.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.1.attn_q.weight:qs", sym_visibility = "private", type = tensor<4096x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.1.attn_q.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.1.attn_q.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.1.attn_k.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.1.attn_k.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.1.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.1.attn_k.weight:qs", sym_visibility = "private", type = tensor<1024x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.1.attn_k.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.1.attn_k.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.1.attn_v.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.1.attn_v.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.1.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.1.attn_v.weight:qs", sym_visibility = "private", type = tensor<1024x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.1.attn_v.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.1.attn_v.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.1.attn_scale"> : tensor<f32>, sym_name = "__auto.blk.1.attn_scale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.1.attn_output.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.1.attn_output.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.1.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.1.attn_output.weight:qs", sym_visibility = "private", type = tensor<4096x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.1.ffn_norm.weight"> : tensor<4096xbf16>, sym_name = "__auto.blk.1.ffn_norm.weight", sym_visibility = "private", type = tensor<4096xbf16>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.1.ffn_gate.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.1.ffn_gate.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.1.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.1.ffn_gate.weight:qs", sym_visibility = "private", type = tensor<14336x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.1.ffn_up.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.1.ffn_up.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.1.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.1.ffn_up.weight:qs", sym_visibility = "private", type = tensor<14336x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.1.ffn_down.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.1.ffn_down.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.1.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>, sym_name = "__auto.blk.1.ffn_down.weight:qs", sym_visibility = "private", type = tensor<4096x14336xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.2.attn_norm.weight"> : tensor<4096xbf16>, sym_name = "__auto.blk.2.attn_norm.weight", sym_visibility = "private", type = tensor<4096xbf16>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.2.attn_q.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.2.attn_q.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.2.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.2.attn_q.weight:qs", sym_visibility = "private", type = tensor<4096x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.2.attn_q.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.2.attn_q.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.2.attn_k.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.2.attn_k.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.2.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.2.attn_k.weight:qs", sym_visibility = "private", type = tensor<1024x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.2.attn_k.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.2.attn_k.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.2.attn_v.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.2.attn_v.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.2.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.2.attn_v.weight:qs", sym_visibility = "private", type = tensor<1024x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.2.attn_v.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.2.attn_v.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.2.attn_scale"> : tensor<f32>, sym_name = "__auto.blk.2.attn_scale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.2.attn_output.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.2.attn_output.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.2.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.2.attn_output.weight:qs", sym_visibility = "private", type = tensor<4096x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.2.ffn_norm.weight"> : tensor<4096xbf16>, sym_name = "__auto.blk.2.ffn_norm.weight", sym_visibility = "private", type = tensor<4096xbf16>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.2.ffn_gate.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.2.ffn_gate.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.2.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.2.ffn_gate.weight:qs", sym_visibility = "private", type = tensor<14336x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.2.ffn_up.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.2.ffn_up.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.2.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.2.ffn_up.weight:qs", sym_visibility = "private", type = tensor<14336x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.2.ffn_down.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.2.ffn_down.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.2.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>, sym_name = "__auto.blk.2.ffn_down.weight:qs", sym_visibility = "private", type = tensor<4096x14336xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.3.attn_norm.weight"> : tensor<4096xbf16>, sym_name = "__auto.blk.3.attn_norm.weight", sym_visibility = "private", type = tensor<4096xbf16>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.3.attn_q.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.3.attn_q.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.3.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.3.attn_q.weight:qs", sym_visibility = "private", type = tensor<4096x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.3.attn_q.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.3.attn_q.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.3.attn_k.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.3.attn_k.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.3.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.3.attn_k.weight:qs", sym_visibility = "private", type = tensor<1024x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.3.attn_k.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.3.attn_k.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.3.attn_v.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.3.attn_v.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.3.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.3.attn_v.weight:qs", sym_visibility = "private", type = tensor<1024x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.3.attn_v.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.3.attn_v.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.3.attn_scale"> : tensor<f32>, sym_name = "__auto.blk.3.attn_scale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.3.attn_output.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.3.attn_output.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.3.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.3.attn_output.weight:qs", sym_visibility = "private", type = tensor<4096x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.3.ffn_norm.weight"> : tensor<4096xbf16>, sym_name = "__auto.blk.3.ffn_norm.weight", sym_visibility = "private", type = tensor<4096xbf16>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.3.ffn_gate.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.3.ffn_gate.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.3.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.3.ffn_gate.weight:qs", sym_visibility = "private", type = tensor<14336x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.3.ffn_up.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.3.ffn_up.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.3.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.3.ffn_up.weight:qs", sym_visibility = "private", type = tensor<14336x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.3.ffn_down.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.3.ffn_down.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.3.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>, sym_name = "__auto.blk.3.ffn_down.weight:qs", sym_visibility = "private", type = tensor<4096x14336xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.4.attn_norm.weight"> : tensor<4096xbf16>, sym_name = "__auto.blk.4.attn_norm.weight", sym_visibility = "private", type = tensor<4096xbf16>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.4.attn_q.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.4.attn_q.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.4.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.4.attn_q.weight:qs", sym_visibility = "private", type = tensor<4096x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.4.attn_q.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.4.attn_q.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.4.attn_k.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.4.attn_k.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.4.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.4.attn_k.weight:qs", sym_visibility = "private", type = tensor<1024x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.4.attn_k.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.4.attn_k.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.4.attn_v.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.4.attn_v.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.4.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.4.attn_v.weight:qs", sym_visibility = "private", type = tensor<1024x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.4.attn_v.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.4.attn_v.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.4.attn_scale"> : tensor<f32>, sym_name = "__auto.blk.4.attn_scale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.4.attn_output.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.4.attn_output.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.4.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.4.attn_output.weight:qs", sym_visibility = "private", type = tensor<4096x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.4.ffn_norm.weight"> : tensor<4096xbf16>, sym_name = "__auto.blk.4.ffn_norm.weight", sym_visibility = "private", type = tensor<4096xbf16>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.4.ffn_gate.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.4.ffn_gate.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.4.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.4.ffn_gate.weight:qs", sym_visibility = "private", type = tensor<14336x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.4.ffn_up.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.4.ffn_up.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.4.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.4.ffn_up.weight:qs", sym_visibility = "private", type = tensor<14336x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.4.ffn_down.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.4.ffn_down.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.4.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>, sym_name = "__auto.blk.4.ffn_down.weight:qs", sym_visibility = "private", type = tensor<4096x14336xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.5.attn_norm.weight"> : tensor<4096xbf16>, sym_name = "__auto.blk.5.attn_norm.weight", sym_visibility = "private", type = tensor<4096xbf16>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.5.attn_q.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.5.attn_q.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.5.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.5.attn_q.weight:qs", sym_visibility = "private", type = tensor<4096x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.5.attn_q.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.5.attn_q.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.5.attn_k.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.5.attn_k.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.5.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.5.attn_k.weight:qs", sym_visibility = "private", type = tensor<1024x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.5.attn_k.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.5.attn_k.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.5.attn_v.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.5.attn_v.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.5.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.5.attn_v.weight:qs", sym_visibility = "private", type = tensor<1024x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.5.attn_v.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.5.attn_v.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.5.attn_scale"> : tensor<f32>, sym_name = "__auto.blk.5.attn_scale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.5.attn_output.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.5.attn_output.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.5.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.5.attn_output.weight:qs", sym_visibility = "private", type = tensor<4096x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.5.ffn_norm.weight"> : tensor<4096xbf16>, sym_name = "__auto.blk.5.ffn_norm.weight", sym_visibility = "private", type = tensor<4096xbf16>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.5.ffn_gate.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.5.ffn_gate.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.5.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.5.ffn_gate.weight:qs", sym_visibility = "private", type = tensor<14336x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.5.ffn_up.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.5.ffn_up.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.5.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.5.ffn_up.weight:qs", sym_visibility = "private", type = tensor<14336x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.5.ffn_down.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.5.ffn_down.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.5.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>, sym_name = "__auto.blk.5.ffn_down.weight:qs", sym_visibility = "private", type = tensor<4096x14336xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.6.attn_norm.weight"> : tensor<4096xbf16>, sym_name = "__auto.blk.6.attn_norm.weight", sym_visibility = "private", type = tensor<4096xbf16>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.6.attn_q.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.6.attn_q.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.6.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.6.attn_q.weight:qs", sym_visibility = "private", type = tensor<4096x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.6.attn_q.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.6.attn_q.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.6.attn_k.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.6.attn_k.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.6.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.6.attn_k.weight:qs", sym_visibility = "private", type = tensor<1024x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.6.attn_k.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.6.attn_k.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.6.attn_v.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.6.attn_v.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.6.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.6.attn_v.weight:qs", sym_visibility = "private", type = tensor<1024x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.6.attn_v.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.6.attn_v.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.6.attn_scale"> : tensor<f32>, sym_name = "__auto.blk.6.attn_scale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.6.attn_output.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.6.attn_output.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.6.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.6.attn_output.weight:qs", sym_visibility = "private", type = tensor<4096x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.6.ffn_norm.weight"> : tensor<4096xbf16>, sym_name = "__auto.blk.6.ffn_norm.weight", sym_visibility = "private", type = tensor<4096xbf16>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.6.ffn_gate.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.6.ffn_gate.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.6.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.6.ffn_gate.weight:qs", sym_visibility = "private", type = tensor<14336x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.6.ffn_up.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.6.ffn_up.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.6.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.6.ffn_up.weight:qs", sym_visibility = "private", type = tensor<14336x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.6.ffn_down.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.6.ffn_down.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.6.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>, sym_name = "__auto.blk.6.ffn_down.weight:qs", sym_visibility = "private", type = tensor<4096x14336xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.7.attn_norm.weight"> : tensor<4096xbf16>, sym_name = "__auto.blk.7.attn_norm.weight", sym_visibility = "private", type = tensor<4096xbf16>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.7.attn_q.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.7.attn_q.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.7.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.7.attn_q.weight:qs", sym_visibility = "private", type = tensor<4096x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.7.attn_q.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.7.attn_q.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.7.attn_k.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.7.attn_k.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.7.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.7.attn_k.weight:qs", sym_visibility = "private", type = tensor<1024x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.7.attn_k.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.7.attn_k.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.7.attn_v.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.7.attn_v.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.7.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.7.attn_v.weight:qs", sym_visibility = "private", type = tensor<1024x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.7.attn_v.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.7.attn_v.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.7.attn_scale"> : tensor<f32>, sym_name = "__auto.blk.7.attn_scale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.7.attn_output.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.7.attn_output.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.7.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.7.attn_output.weight:qs", sym_visibility = "private", type = tensor<4096x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.7.ffn_norm.weight"> : tensor<4096xbf16>, sym_name = "__auto.blk.7.ffn_norm.weight", sym_visibility = "private", type = tensor<4096xbf16>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.7.ffn_gate.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.7.ffn_gate.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.7.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.7.ffn_gate.weight:qs", sym_visibility = "private", type = tensor<14336x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.7.ffn_up.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.7.ffn_up.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.7.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.7.ffn_up.weight:qs", sym_visibility = "private", type = tensor<14336x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.7.ffn_down.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.7.ffn_down.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.7.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>, sym_name = "__auto.blk.7.ffn_down.weight:qs", sym_visibility = "private", type = tensor<4096x14336xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.8.attn_norm.weight"> : tensor<4096xbf16>, sym_name = "__auto.blk.8.attn_norm.weight", sym_visibility = "private", type = tensor<4096xbf16>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.8.attn_q.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.8.attn_q.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.8.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.8.attn_q.weight:qs", sym_visibility = "private", type = tensor<4096x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.8.attn_q.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.8.attn_q.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.8.attn_k.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.8.attn_k.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.8.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.8.attn_k.weight:qs", sym_visibility = "private", type = tensor<1024x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.8.attn_k.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.8.attn_k.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.8.attn_v.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.8.attn_v.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.8.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.8.attn_v.weight:qs", sym_visibility = "private", type = tensor<1024x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.8.attn_v.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.8.attn_v.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.8.attn_scale"> : tensor<f32>, sym_name = "__auto.blk.8.attn_scale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.8.attn_output.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.8.attn_output.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.8.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.8.attn_output.weight:qs", sym_visibility = "private", type = tensor<4096x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.8.ffn_norm.weight"> : tensor<4096xbf16>, sym_name = "__auto.blk.8.ffn_norm.weight", sym_visibility = "private", type = tensor<4096xbf16>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.8.ffn_gate.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.8.ffn_gate.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.8.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.8.ffn_gate.weight:qs", sym_visibility = "private", type = tensor<14336x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.8.ffn_up.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.8.ffn_up.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.8.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.8.ffn_up.weight:qs", sym_visibility = "private", type = tensor<14336x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.8.ffn_down.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.8.ffn_down.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.8.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>, sym_name = "__auto.blk.8.ffn_down.weight:qs", sym_visibility = "private", type = tensor<4096x14336xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.9.attn_norm.weight"> : tensor<4096xbf16>, sym_name = "__auto.blk.9.attn_norm.weight", sym_visibility = "private", type = tensor<4096xbf16>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.9.attn_q.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.9.attn_q.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.9.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.9.attn_q.weight:qs", sym_visibility = "private", type = tensor<4096x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.9.attn_q.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.9.attn_q.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.9.attn_k.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.9.attn_k.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.9.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.9.attn_k.weight:qs", sym_visibility = "private", type = tensor<1024x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.9.attn_k.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.9.attn_k.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.9.attn_v.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.9.attn_v.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.9.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.9.attn_v.weight:qs", sym_visibility = "private", type = tensor<1024x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.9.attn_v.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.9.attn_v.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.9.attn_scale"> : tensor<f32>, sym_name = "__auto.blk.9.attn_scale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.9.attn_output.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.9.attn_output.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.9.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.9.attn_output.weight:qs", sym_visibility = "private", type = tensor<4096x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.9.ffn_norm.weight"> : tensor<4096xbf16>, sym_name = "__auto.blk.9.ffn_norm.weight", sym_visibility = "private", type = tensor<4096xbf16>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.9.ffn_gate.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.9.ffn_gate.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.9.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.9.ffn_gate.weight:qs", sym_visibility = "private", type = tensor<14336x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.9.ffn_up.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.9.ffn_up.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.9.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.9.ffn_up.weight:qs", sym_visibility = "private", type = tensor<14336x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.9.ffn_down.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.9.ffn_down.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.9.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>, sym_name = "__auto.blk.9.ffn_down.weight:qs", sym_visibility = "private", type = tensor<4096x14336xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.10.attn_norm.weight"> : tensor<4096xbf16>, sym_name = "__auto.blk.10.attn_norm.weight", sym_visibility = "private", type = tensor<4096xbf16>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.10.attn_q.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.10.attn_q.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.10.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.10.attn_q.weight:qs", sym_visibility = "private", type = tensor<4096x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.10.attn_q.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.10.attn_q.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.10.attn_k.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.10.attn_k.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.10.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.10.attn_k.weight:qs", sym_visibility = "private", type = tensor<1024x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.10.attn_k.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.10.attn_k.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.10.attn_v.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.10.attn_v.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.10.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.10.attn_v.weight:qs", sym_visibility = "private", type = tensor<1024x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.10.attn_v.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.10.attn_v.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.10.attn_scale"> : tensor<f32>, sym_name = "__auto.blk.10.attn_scale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.10.attn_output.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.10.attn_output.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.10.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.10.attn_output.weight:qs", sym_visibility = "private", type = tensor<4096x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.10.ffn_norm.weight"> : tensor<4096xbf16>, sym_name = "__auto.blk.10.ffn_norm.weight", sym_visibility = "private", type = tensor<4096xbf16>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.10.ffn_gate.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.10.ffn_gate.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.10.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.10.ffn_gate.weight:qs", sym_visibility = "private", type = tensor<14336x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.10.ffn_up.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.10.ffn_up.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.10.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.10.ffn_up.weight:qs", sym_visibility = "private", type = tensor<14336x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.10.ffn_down.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.10.ffn_down.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.10.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>, sym_name = "__auto.blk.10.ffn_down.weight:qs", sym_visibility = "private", type = tensor<4096x14336xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.11.attn_norm.weight"> : tensor<4096xbf16>, sym_name = "__auto.blk.11.attn_norm.weight", sym_visibility = "private", type = tensor<4096xbf16>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.11.attn_q.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.11.attn_q.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.11.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.11.attn_q.weight:qs", sym_visibility = "private", type = tensor<4096x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.11.attn_q.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.11.attn_q.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.11.attn_k.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.11.attn_k.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.11.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.11.attn_k.weight:qs", sym_visibility = "private", type = tensor<1024x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.11.attn_k.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.11.attn_k.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.11.attn_v.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.11.attn_v.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.11.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.11.attn_v.weight:qs", sym_visibility = "private", type = tensor<1024x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.11.attn_v.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.11.attn_v.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.11.attn_scale"> : tensor<f32>, sym_name = "__auto.blk.11.attn_scale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.11.attn_output.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.11.attn_output.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.11.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.11.attn_output.weight:qs", sym_visibility = "private", type = tensor<4096x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.11.ffn_norm.weight"> : tensor<4096xbf16>, sym_name = "__auto.blk.11.ffn_norm.weight", sym_visibility = "private", type = tensor<4096xbf16>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.11.ffn_gate.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.11.ffn_gate.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.11.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.11.ffn_gate.weight:qs", sym_visibility = "private", type = tensor<14336x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.11.ffn_up.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.11.ffn_up.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.11.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.11.ffn_up.weight:qs", sym_visibility = "private", type = tensor<14336x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.11.ffn_down.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.11.ffn_down.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.11.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>, sym_name = "__auto.blk.11.ffn_down.weight:qs", sym_visibility = "private", type = tensor<4096x14336xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.12.attn_norm.weight"> : tensor<4096xbf16>, sym_name = "__auto.blk.12.attn_norm.weight", sym_visibility = "private", type = tensor<4096xbf16>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.12.attn_q.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.12.attn_q.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.12.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.12.attn_q.weight:qs", sym_visibility = "private", type = tensor<4096x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.12.attn_q.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.12.attn_q.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.12.attn_k.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.12.attn_k.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.12.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.12.attn_k.weight:qs", sym_visibility = "private", type = tensor<1024x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.12.attn_k.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.12.attn_k.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.12.attn_v.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.12.attn_v.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.12.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.12.attn_v.weight:qs", sym_visibility = "private", type = tensor<1024x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.12.attn_v.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.12.attn_v.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.12.attn_scale"> : tensor<f32>, sym_name = "__auto.blk.12.attn_scale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.12.attn_output.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.12.attn_output.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.12.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.12.attn_output.weight:qs", sym_visibility = "private", type = tensor<4096x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.12.ffn_norm.weight"> : tensor<4096xbf16>, sym_name = "__auto.blk.12.ffn_norm.weight", sym_visibility = "private", type = tensor<4096xbf16>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.12.ffn_gate.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.12.ffn_gate.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.12.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.12.ffn_gate.weight:qs", sym_visibility = "private", type = tensor<14336x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.12.ffn_up.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.12.ffn_up.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.12.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.12.ffn_up.weight:qs", sym_visibility = "private", type = tensor<14336x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.12.ffn_down.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.12.ffn_down.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.12.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>, sym_name = "__auto.blk.12.ffn_down.weight:qs", sym_visibility = "private", type = tensor<4096x14336xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.13.attn_norm.weight"> : tensor<4096xbf16>, sym_name = "__auto.blk.13.attn_norm.weight", sym_visibility = "private", type = tensor<4096xbf16>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.13.attn_q.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.13.attn_q.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.13.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.13.attn_q.weight:qs", sym_visibility = "private", type = tensor<4096x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.13.attn_q.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.13.attn_q.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.13.attn_k.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.13.attn_k.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.13.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.13.attn_k.weight:qs", sym_visibility = "private", type = tensor<1024x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.13.attn_k.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.13.attn_k.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.13.attn_v.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.13.attn_v.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.13.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.13.attn_v.weight:qs", sym_visibility = "private", type = tensor<1024x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.13.attn_v.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.13.attn_v.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.13.attn_scale"> : tensor<f32>, sym_name = "__auto.blk.13.attn_scale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.13.attn_output.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.13.attn_output.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.13.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.13.attn_output.weight:qs", sym_visibility = "private", type = tensor<4096x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.13.ffn_norm.weight"> : tensor<4096xbf16>, sym_name = "__auto.blk.13.ffn_norm.weight", sym_visibility = "private", type = tensor<4096xbf16>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.13.ffn_gate.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.13.ffn_gate.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.13.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.13.ffn_gate.weight:qs", sym_visibility = "private", type = tensor<14336x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.13.ffn_up.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.13.ffn_up.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.13.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.13.ffn_up.weight:qs", sym_visibility = "private", type = tensor<14336x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.13.ffn_down.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.13.ffn_down.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.13.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>, sym_name = "__auto.blk.13.ffn_down.weight:qs", sym_visibility = "private", type = tensor<4096x14336xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.14.attn_norm.weight"> : tensor<4096xbf16>, sym_name = "__auto.blk.14.attn_norm.weight", sym_visibility = "private", type = tensor<4096xbf16>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.14.attn_q.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.14.attn_q.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.14.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.14.attn_q.weight:qs", sym_visibility = "private", type = tensor<4096x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.14.attn_q.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.14.attn_q.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.14.attn_k.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.14.attn_k.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.14.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.14.attn_k.weight:qs", sym_visibility = "private", type = tensor<1024x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.14.attn_k.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.14.attn_k.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.14.attn_v.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.14.attn_v.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.14.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.14.attn_v.weight:qs", sym_visibility = "private", type = tensor<1024x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.14.attn_v.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.14.attn_v.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.14.attn_scale"> : tensor<f32>, sym_name = "__auto.blk.14.attn_scale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.14.attn_output.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.14.attn_output.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.14.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.14.attn_output.weight:qs", sym_visibility = "private", type = tensor<4096x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.14.ffn_norm.weight"> : tensor<4096xbf16>, sym_name = "__auto.blk.14.ffn_norm.weight", sym_visibility = "private", type = tensor<4096xbf16>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.14.ffn_gate.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.14.ffn_gate.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.14.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.14.ffn_gate.weight:qs", sym_visibility = "private", type = tensor<14336x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.14.ffn_up.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.14.ffn_up.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.14.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.14.ffn_up.weight:qs", sym_visibility = "private", type = tensor<14336x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.14.ffn_down.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.14.ffn_down.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.14.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>, sym_name = "__auto.blk.14.ffn_down.weight:qs", sym_visibility = "private", type = tensor<4096x14336xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.15.attn_norm.weight"> : tensor<4096xbf16>, sym_name = "__auto.blk.15.attn_norm.weight", sym_visibility = "private", type = tensor<4096xbf16>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.15.attn_q.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.15.attn_q.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.15.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.15.attn_q.weight:qs", sym_visibility = "private", type = tensor<4096x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.15.attn_q.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.15.attn_q.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.15.attn_k.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.15.attn_k.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.15.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.15.attn_k.weight:qs", sym_visibility = "private", type = tensor<1024x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.15.attn_k.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.15.attn_k.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.15.attn_v.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.15.attn_v.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.15.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.15.attn_v.weight:qs", sym_visibility = "private", type = tensor<1024x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.15.attn_v.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.15.attn_v.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.15.attn_scale"> : tensor<f32>, sym_name = "__auto.blk.15.attn_scale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.15.attn_output.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.15.attn_output.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.15.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.15.attn_output.weight:qs", sym_visibility = "private", type = tensor<4096x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.15.ffn_norm.weight"> : tensor<4096xbf16>, sym_name = "__auto.blk.15.ffn_norm.weight", sym_visibility = "private", type = tensor<4096xbf16>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.15.ffn_gate.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.15.ffn_gate.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.15.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.15.ffn_gate.weight:qs", sym_visibility = "private", type = tensor<14336x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.15.ffn_up.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.15.ffn_up.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.15.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.15.ffn_up.weight:qs", sym_visibility = "private", type = tensor<14336x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.15.ffn_down.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.15.ffn_down.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.15.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>, sym_name = "__auto.blk.15.ffn_down.weight:qs", sym_visibility = "private", type = tensor<4096x14336xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.16.attn_norm.weight"> : tensor<4096xbf16>, sym_name = "__auto.blk.16.attn_norm.weight", sym_visibility = "private", type = tensor<4096xbf16>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.16.attn_q.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.16.attn_q.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.16.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.16.attn_q.weight:qs", sym_visibility = "private", type = tensor<4096x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.16.attn_q.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.16.attn_q.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.16.attn_k.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.16.attn_k.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.16.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.16.attn_k.weight:qs", sym_visibility = "private", type = tensor<1024x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.16.attn_k.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.16.attn_k.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.16.attn_v.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.16.attn_v.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.16.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.16.attn_v.weight:qs", sym_visibility = "private", type = tensor<1024x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.16.attn_v.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.16.attn_v.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.16.attn_scale"> : tensor<f32>, sym_name = "__auto.blk.16.attn_scale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.16.attn_output.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.16.attn_output.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.16.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.16.attn_output.weight:qs", sym_visibility = "private", type = tensor<4096x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.16.ffn_norm.weight"> : tensor<4096xbf16>, sym_name = "__auto.blk.16.ffn_norm.weight", sym_visibility = "private", type = tensor<4096xbf16>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.16.ffn_gate.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.16.ffn_gate.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.16.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.16.ffn_gate.weight:qs", sym_visibility = "private", type = tensor<14336x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.16.ffn_up.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.16.ffn_up.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.16.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.16.ffn_up.weight:qs", sym_visibility = "private", type = tensor<14336x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.16.ffn_down.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.16.ffn_down.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.16.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>, sym_name = "__auto.blk.16.ffn_down.weight:qs", sym_visibility = "private", type = tensor<4096x14336xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.17.attn_norm.weight"> : tensor<4096xbf16>, sym_name = "__auto.blk.17.attn_norm.weight", sym_visibility = "private", type = tensor<4096xbf16>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.17.attn_q.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.17.attn_q.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.17.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.17.attn_q.weight:qs", sym_visibility = "private", type = tensor<4096x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.17.attn_q.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.17.attn_q.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.17.attn_k.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.17.attn_k.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.17.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.17.attn_k.weight:qs", sym_visibility = "private", type = tensor<1024x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.17.attn_k.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.17.attn_k.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.17.attn_v.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.17.attn_v.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.17.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.17.attn_v.weight:qs", sym_visibility = "private", type = tensor<1024x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.17.attn_v.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.17.attn_v.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.17.attn_scale"> : tensor<f32>, sym_name = "__auto.blk.17.attn_scale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.17.attn_output.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.17.attn_output.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.17.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.17.attn_output.weight:qs", sym_visibility = "private", type = tensor<4096x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.17.ffn_norm.weight"> : tensor<4096xbf16>, sym_name = "__auto.blk.17.ffn_norm.weight", sym_visibility = "private", type = tensor<4096xbf16>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.17.ffn_gate.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.17.ffn_gate.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.17.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.17.ffn_gate.weight:qs", sym_visibility = "private", type = tensor<14336x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.17.ffn_up.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.17.ffn_up.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.17.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.17.ffn_up.weight:qs", sym_visibility = "private", type = tensor<14336x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.17.ffn_down.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.17.ffn_down.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.17.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>, sym_name = "__auto.blk.17.ffn_down.weight:qs", sym_visibility = "private", type = tensor<4096x14336xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.18.attn_norm.weight"> : tensor<4096xbf16>, sym_name = "__auto.blk.18.attn_norm.weight", sym_visibility = "private", type = tensor<4096xbf16>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.18.attn_q.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.18.attn_q.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.18.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.18.attn_q.weight:qs", sym_visibility = "private", type = tensor<4096x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.18.attn_q.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.18.attn_q.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.18.attn_k.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.18.attn_k.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.18.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.18.attn_k.weight:qs", sym_visibility = "private", type = tensor<1024x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.18.attn_k.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.18.attn_k.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.18.attn_v.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.18.attn_v.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.18.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.18.attn_v.weight:qs", sym_visibility = "private", type = tensor<1024x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.18.attn_v.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.18.attn_v.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.18.attn_scale"> : tensor<f32>, sym_name = "__auto.blk.18.attn_scale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.18.attn_output.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.18.attn_output.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.18.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.18.attn_output.weight:qs", sym_visibility = "private", type = tensor<4096x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.18.ffn_norm.weight"> : tensor<4096xbf16>, sym_name = "__auto.blk.18.ffn_norm.weight", sym_visibility = "private", type = tensor<4096xbf16>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.18.ffn_gate.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.18.ffn_gate.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.18.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.18.ffn_gate.weight:qs", sym_visibility = "private", type = tensor<14336x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.18.ffn_up.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.18.ffn_up.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.18.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.18.ffn_up.weight:qs", sym_visibility = "private", type = tensor<14336x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.18.ffn_down.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.18.ffn_down.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.18.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>, sym_name = "__auto.blk.18.ffn_down.weight:qs", sym_visibility = "private", type = tensor<4096x14336xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.19.attn_norm.weight"> : tensor<4096xbf16>, sym_name = "__auto.blk.19.attn_norm.weight", sym_visibility = "private", type = tensor<4096xbf16>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.19.attn_q.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.19.attn_q.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.19.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.19.attn_q.weight:qs", sym_visibility = "private", type = tensor<4096x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.19.attn_q.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.19.attn_q.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.19.attn_k.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.19.attn_k.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.19.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.19.attn_k.weight:qs", sym_visibility = "private", type = tensor<1024x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.19.attn_k.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.19.attn_k.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.19.attn_v.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.19.attn_v.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.19.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.19.attn_v.weight:qs", sym_visibility = "private", type = tensor<1024x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.19.attn_v.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.19.attn_v.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.19.attn_scale"> : tensor<f32>, sym_name = "__auto.blk.19.attn_scale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.19.attn_output.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.19.attn_output.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.19.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.19.attn_output.weight:qs", sym_visibility = "private", type = tensor<4096x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.19.ffn_norm.weight"> : tensor<4096xbf16>, sym_name = "__auto.blk.19.ffn_norm.weight", sym_visibility = "private", type = tensor<4096xbf16>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.19.ffn_gate.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.19.ffn_gate.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.19.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.19.ffn_gate.weight:qs", sym_visibility = "private", type = tensor<14336x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.19.ffn_up.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.19.ffn_up.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.19.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.19.ffn_up.weight:qs", sym_visibility = "private", type = tensor<14336x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.19.ffn_down.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.19.ffn_down.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.19.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>, sym_name = "__auto.blk.19.ffn_down.weight:qs", sym_visibility = "private", type = tensor<4096x14336xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.20.attn_norm.weight"> : tensor<4096xbf16>, sym_name = "__auto.blk.20.attn_norm.weight", sym_visibility = "private", type = tensor<4096xbf16>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.20.attn_q.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.20.attn_q.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.20.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.20.attn_q.weight:qs", sym_visibility = "private", type = tensor<4096x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.20.attn_q.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.20.attn_q.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.20.attn_k.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.20.attn_k.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.20.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.20.attn_k.weight:qs", sym_visibility = "private", type = tensor<1024x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.20.attn_k.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.20.attn_k.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.20.attn_v.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.20.attn_v.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.20.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.20.attn_v.weight:qs", sym_visibility = "private", type = tensor<1024x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.20.attn_v.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.20.attn_v.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.20.attn_scale"> : tensor<f32>, sym_name = "__auto.blk.20.attn_scale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.20.attn_output.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.20.attn_output.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.20.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.20.attn_output.weight:qs", sym_visibility = "private", type = tensor<4096x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.20.ffn_norm.weight"> : tensor<4096xbf16>, sym_name = "__auto.blk.20.ffn_norm.weight", sym_visibility = "private", type = tensor<4096xbf16>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.20.ffn_gate.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.20.ffn_gate.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.20.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.20.ffn_gate.weight:qs", sym_visibility = "private", type = tensor<14336x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.20.ffn_up.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.20.ffn_up.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.20.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.20.ffn_up.weight:qs", sym_visibility = "private", type = tensor<14336x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.20.ffn_down.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.20.ffn_down.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.20.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>, sym_name = "__auto.blk.20.ffn_down.weight:qs", sym_visibility = "private", type = tensor<4096x14336xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.21.attn_norm.weight"> : tensor<4096xbf16>, sym_name = "__auto.blk.21.attn_norm.weight", sym_visibility = "private", type = tensor<4096xbf16>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.21.attn_q.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.21.attn_q.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.21.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.21.attn_q.weight:qs", sym_visibility = "private", type = tensor<4096x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.21.attn_q.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.21.attn_q.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.21.attn_k.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.21.attn_k.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.21.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.21.attn_k.weight:qs", sym_visibility = "private", type = tensor<1024x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.21.attn_k.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.21.attn_k.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.21.attn_v.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.21.attn_v.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.21.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.21.attn_v.weight:qs", sym_visibility = "private", type = tensor<1024x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.21.attn_v.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.21.attn_v.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.21.attn_scale"> : tensor<f32>, sym_name = "__auto.blk.21.attn_scale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.21.attn_output.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.21.attn_output.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.21.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.21.attn_output.weight:qs", sym_visibility = "private", type = tensor<4096x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.21.ffn_norm.weight"> : tensor<4096xbf16>, sym_name = "__auto.blk.21.ffn_norm.weight", sym_visibility = "private", type = tensor<4096xbf16>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.21.ffn_gate.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.21.ffn_gate.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.21.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.21.ffn_gate.weight:qs", sym_visibility = "private", type = tensor<14336x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.21.ffn_up.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.21.ffn_up.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.21.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.21.ffn_up.weight:qs", sym_visibility = "private", type = tensor<14336x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.21.ffn_down.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.21.ffn_down.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.21.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>, sym_name = "__auto.blk.21.ffn_down.weight:qs", sym_visibility = "private", type = tensor<4096x14336xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.22.attn_norm.weight"> : tensor<4096xbf16>, sym_name = "__auto.blk.22.attn_norm.weight", sym_visibility = "private", type = tensor<4096xbf16>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.22.attn_q.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.22.attn_q.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.22.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.22.attn_q.weight:qs", sym_visibility = "private", type = tensor<4096x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.22.attn_q.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.22.attn_q.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.22.attn_k.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.22.attn_k.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.22.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.22.attn_k.weight:qs", sym_visibility = "private", type = tensor<1024x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.22.attn_k.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.22.attn_k.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.22.attn_v.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.22.attn_v.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.22.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.22.attn_v.weight:qs", sym_visibility = "private", type = tensor<1024x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.22.attn_v.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.22.attn_v.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.22.attn_scale"> : tensor<f32>, sym_name = "__auto.blk.22.attn_scale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.22.attn_output.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.22.attn_output.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.22.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.22.attn_output.weight:qs", sym_visibility = "private", type = tensor<4096x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.22.ffn_norm.weight"> : tensor<4096xbf16>, sym_name = "__auto.blk.22.ffn_norm.weight", sym_visibility = "private", type = tensor<4096xbf16>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.22.ffn_gate.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.22.ffn_gate.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.22.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.22.ffn_gate.weight:qs", sym_visibility = "private", type = tensor<14336x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.22.ffn_up.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.22.ffn_up.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.22.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.22.ffn_up.weight:qs", sym_visibility = "private", type = tensor<14336x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.22.ffn_down.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.22.ffn_down.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.22.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>, sym_name = "__auto.blk.22.ffn_down.weight:qs", sym_visibility = "private", type = tensor<4096x14336xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.23.attn_norm.weight"> : tensor<4096xbf16>, sym_name = "__auto.blk.23.attn_norm.weight", sym_visibility = "private", type = tensor<4096xbf16>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.23.attn_q.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.23.attn_q.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.23.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.23.attn_q.weight:qs", sym_visibility = "private", type = tensor<4096x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.23.attn_q.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.23.attn_q.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.23.attn_k.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.23.attn_k.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.23.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.23.attn_k.weight:qs", sym_visibility = "private", type = tensor<1024x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.23.attn_k.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.23.attn_k.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.23.attn_v.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.23.attn_v.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.23.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.23.attn_v.weight:qs", sym_visibility = "private", type = tensor<1024x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.23.attn_v.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.23.attn_v.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.23.attn_scale"> : tensor<f32>, sym_name = "__auto.blk.23.attn_scale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.23.attn_output.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.23.attn_output.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.23.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.23.attn_output.weight:qs", sym_visibility = "private", type = tensor<4096x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.23.ffn_norm.weight"> : tensor<4096xbf16>, sym_name = "__auto.blk.23.ffn_norm.weight", sym_visibility = "private", type = tensor<4096xbf16>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.23.ffn_gate.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.23.ffn_gate.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.23.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.23.ffn_gate.weight:qs", sym_visibility = "private", type = tensor<14336x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.23.ffn_up.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.23.ffn_up.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.23.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.23.ffn_up.weight:qs", sym_visibility = "private", type = tensor<14336x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.23.ffn_down.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.23.ffn_down.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.23.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>, sym_name = "__auto.blk.23.ffn_down.weight:qs", sym_visibility = "private", type = tensor<4096x14336xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.24.attn_norm.weight"> : tensor<4096xbf16>, sym_name = "__auto.blk.24.attn_norm.weight", sym_visibility = "private", type = tensor<4096xbf16>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.24.attn_q.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.24.attn_q.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.24.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.24.attn_q.weight:qs", sym_visibility = "private", type = tensor<4096x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.24.attn_q.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.24.attn_q.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.24.attn_k.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.24.attn_k.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.24.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.24.attn_k.weight:qs", sym_visibility = "private", type = tensor<1024x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.24.attn_k.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.24.attn_k.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.24.attn_v.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.24.attn_v.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.24.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.24.attn_v.weight:qs", sym_visibility = "private", type = tensor<1024x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.24.attn_v.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.24.attn_v.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.24.attn_scale"> : tensor<f32>, sym_name = "__auto.blk.24.attn_scale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.24.attn_output.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.24.attn_output.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.24.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.24.attn_output.weight:qs", sym_visibility = "private", type = tensor<4096x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.24.ffn_norm.weight"> : tensor<4096xbf16>, sym_name = "__auto.blk.24.ffn_norm.weight", sym_visibility = "private", type = tensor<4096xbf16>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.24.ffn_gate.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.24.ffn_gate.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.24.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.24.ffn_gate.weight:qs", sym_visibility = "private", type = tensor<14336x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.24.ffn_up.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.24.ffn_up.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.24.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.24.ffn_up.weight:qs", sym_visibility = "private", type = tensor<14336x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.24.ffn_down.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.24.ffn_down.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.24.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>, sym_name = "__auto.blk.24.ffn_down.weight:qs", sym_visibility = "private", type = tensor<4096x14336xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.25.attn_norm.weight"> : tensor<4096xbf16>, sym_name = "__auto.blk.25.attn_norm.weight", sym_visibility = "private", type = tensor<4096xbf16>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.25.attn_q.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.25.attn_q.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.25.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.25.attn_q.weight:qs", sym_visibility = "private", type = tensor<4096x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.25.attn_q.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.25.attn_q.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.25.attn_k.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.25.attn_k.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.25.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.25.attn_k.weight:qs", sym_visibility = "private", type = tensor<1024x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.25.attn_k.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.25.attn_k.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.25.attn_v.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.25.attn_v.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.25.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.25.attn_v.weight:qs", sym_visibility = "private", type = tensor<1024x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.25.attn_v.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.25.attn_v.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.25.attn_scale"> : tensor<f32>, sym_name = "__auto.blk.25.attn_scale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.25.attn_output.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.25.attn_output.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.25.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.25.attn_output.weight:qs", sym_visibility = "private", type = tensor<4096x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.25.ffn_norm.weight"> : tensor<4096xbf16>, sym_name = "__auto.blk.25.ffn_norm.weight", sym_visibility = "private", type = tensor<4096xbf16>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.25.ffn_gate.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.25.ffn_gate.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.25.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.25.ffn_gate.weight:qs", sym_visibility = "private", type = tensor<14336x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.25.ffn_up.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.25.ffn_up.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.25.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.25.ffn_up.weight:qs", sym_visibility = "private", type = tensor<14336x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.25.ffn_down.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.25.ffn_down.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.25.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>, sym_name = "__auto.blk.25.ffn_down.weight:qs", sym_visibility = "private", type = tensor<4096x14336xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.26.attn_norm.weight"> : tensor<4096xbf16>, sym_name = "__auto.blk.26.attn_norm.weight", sym_visibility = "private", type = tensor<4096xbf16>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.26.attn_q.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.26.attn_q.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.26.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.26.attn_q.weight:qs", sym_visibility = "private", type = tensor<4096x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.26.attn_q.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.26.attn_q.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.26.attn_k.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.26.attn_k.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.26.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.26.attn_k.weight:qs", sym_visibility = "private", type = tensor<1024x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.26.attn_k.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.26.attn_k.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.26.attn_v.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.26.attn_v.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.26.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.26.attn_v.weight:qs", sym_visibility = "private", type = tensor<1024x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.26.attn_v.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.26.attn_v.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.26.attn_scale"> : tensor<f32>, sym_name = "__auto.blk.26.attn_scale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.26.attn_output.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.26.attn_output.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.26.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.26.attn_output.weight:qs", sym_visibility = "private", type = tensor<4096x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.26.ffn_norm.weight"> : tensor<4096xbf16>, sym_name = "__auto.blk.26.ffn_norm.weight", sym_visibility = "private", type = tensor<4096xbf16>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.26.ffn_gate.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.26.ffn_gate.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.26.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.26.ffn_gate.weight:qs", sym_visibility = "private", type = tensor<14336x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.26.ffn_up.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.26.ffn_up.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.26.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.26.ffn_up.weight:qs", sym_visibility = "private", type = tensor<14336x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.26.ffn_down.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.26.ffn_down.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.26.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>, sym_name = "__auto.blk.26.ffn_down.weight:qs", sym_visibility = "private", type = tensor<4096x14336xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.27.attn_norm.weight"> : tensor<4096xbf16>, sym_name = "__auto.blk.27.attn_norm.weight", sym_visibility = "private", type = tensor<4096xbf16>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.27.attn_q.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.27.attn_q.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.27.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.27.attn_q.weight:qs", sym_visibility = "private", type = tensor<4096x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.27.attn_q.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.27.attn_q.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.27.attn_k.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.27.attn_k.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.27.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.27.attn_k.weight:qs", sym_visibility = "private", type = tensor<1024x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.27.attn_k.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.27.attn_k.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.27.attn_v.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.27.attn_v.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.27.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.27.attn_v.weight:qs", sym_visibility = "private", type = tensor<1024x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.27.attn_v.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.27.attn_v.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.27.attn_scale"> : tensor<f32>, sym_name = "__auto.blk.27.attn_scale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.27.attn_output.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.27.attn_output.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.27.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.27.attn_output.weight:qs", sym_visibility = "private", type = tensor<4096x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.27.ffn_norm.weight"> : tensor<4096xbf16>, sym_name = "__auto.blk.27.ffn_norm.weight", sym_visibility = "private", type = tensor<4096xbf16>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.27.ffn_gate.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.27.ffn_gate.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.27.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.27.ffn_gate.weight:qs", sym_visibility = "private", type = tensor<14336x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.27.ffn_up.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.27.ffn_up.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.27.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.27.ffn_up.weight:qs", sym_visibility = "private", type = tensor<14336x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.27.ffn_down.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.27.ffn_down.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.27.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>, sym_name = "__auto.blk.27.ffn_down.weight:qs", sym_visibility = "private", type = tensor<4096x14336xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.28.attn_norm.weight"> : tensor<4096xbf16>, sym_name = "__auto.blk.28.attn_norm.weight", sym_visibility = "private", type = tensor<4096xbf16>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.28.attn_q.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.28.attn_q.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.28.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.28.attn_q.weight:qs", sym_visibility = "private", type = tensor<4096x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.28.attn_q.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.28.attn_q.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.28.attn_k.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.28.attn_k.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.28.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.28.attn_k.weight:qs", sym_visibility = "private", type = tensor<1024x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.28.attn_k.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.28.attn_k.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.28.attn_v.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.28.attn_v.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.28.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.28.attn_v.weight:qs", sym_visibility = "private", type = tensor<1024x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.28.attn_v.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.28.attn_v.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.28.attn_scale"> : tensor<f32>, sym_name = "__auto.blk.28.attn_scale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.28.attn_output.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.28.attn_output.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.28.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.28.attn_output.weight:qs", sym_visibility = "private", type = tensor<4096x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.28.ffn_norm.weight"> : tensor<4096xbf16>, sym_name = "__auto.blk.28.ffn_norm.weight", sym_visibility = "private", type = tensor<4096xbf16>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.28.ffn_gate.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.28.ffn_gate.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.28.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.28.ffn_gate.weight:qs", sym_visibility = "private", type = tensor<14336x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.28.ffn_up.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.28.ffn_up.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.28.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.28.ffn_up.weight:qs", sym_visibility = "private", type = tensor<14336x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.28.ffn_down.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.28.ffn_down.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.28.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>, sym_name = "__auto.blk.28.ffn_down.weight:qs", sym_visibility = "private", type = tensor<4096x14336xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.29.attn_norm.weight"> : tensor<4096xbf16>, sym_name = "__auto.blk.29.attn_norm.weight", sym_visibility = "private", type = tensor<4096xbf16>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.29.attn_q.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.29.attn_q.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.29.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.29.attn_q.weight:qs", sym_visibility = "private", type = tensor<4096x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.29.attn_q.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.29.attn_q.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.29.attn_k.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.29.attn_k.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.29.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.29.attn_k.weight:qs", sym_visibility = "private", type = tensor<1024x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.29.attn_k.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.29.attn_k.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.29.attn_v.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.29.attn_v.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.29.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.29.attn_v.weight:qs", sym_visibility = "private", type = tensor<1024x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.29.attn_v.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.29.attn_v.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.29.attn_scale"> : tensor<f32>, sym_name = "__auto.blk.29.attn_scale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.29.attn_output.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.29.attn_output.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.29.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.29.attn_output.weight:qs", sym_visibility = "private", type = tensor<4096x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.29.ffn_norm.weight"> : tensor<4096xbf16>, sym_name = "__auto.blk.29.ffn_norm.weight", sym_visibility = "private", type = tensor<4096xbf16>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.29.ffn_gate.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.29.ffn_gate.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.29.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.29.ffn_gate.weight:qs", sym_visibility = "private", type = tensor<14336x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.29.ffn_up.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.29.ffn_up.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.29.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.29.ffn_up.weight:qs", sym_visibility = "private", type = tensor<14336x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.29.ffn_down.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.29.ffn_down.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.29.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>, sym_name = "__auto.blk.29.ffn_down.weight:qs", sym_visibility = "private", type = tensor<4096x14336xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.30.attn_norm.weight"> : tensor<4096xbf16>, sym_name = "__auto.blk.30.attn_norm.weight", sym_visibility = "private", type = tensor<4096xbf16>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.30.attn_q.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.30.attn_q.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.30.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.30.attn_q.weight:qs", sym_visibility = "private", type = tensor<4096x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.30.attn_q.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.30.attn_q.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.30.attn_k.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.30.attn_k.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.30.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.30.attn_k.weight:qs", sym_visibility = "private", type = tensor<1024x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.30.attn_k.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.30.attn_k.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.30.attn_v.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.30.attn_v.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.30.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.30.attn_v.weight:qs", sym_visibility = "private", type = tensor<1024x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.30.attn_v.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.30.attn_v.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.30.attn_scale"> : tensor<f32>, sym_name = "__auto.blk.30.attn_scale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.30.attn_output.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.30.attn_output.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.30.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.30.attn_output.weight:qs", sym_visibility = "private", type = tensor<4096x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.30.ffn_norm.weight"> : tensor<4096xbf16>, sym_name = "__auto.blk.30.ffn_norm.weight", sym_visibility = "private", type = tensor<4096xbf16>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.30.ffn_gate.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.30.ffn_gate.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.30.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.30.ffn_gate.weight:qs", sym_visibility = "private", type = tensor<14336x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.30.ffn_up.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.30.ffn_up.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.30.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.30.ffn_up.weight:qs", sym_visibility = "private", type = tensor<14336x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.30.ffn_down.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.30.ffn_down.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.30.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>, sym_name = "__auto.blk.30.ffn_down.weight:qs", sym_visibility = "private", type = tensor<4096x14336xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.31.attn_norm.weight"> : tensor<4096xbf16>, sym_name = "__auto.blk.31.attn_norm.weight", sym_visibility = "private", type = tensor<4096xbf16>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.31.attn_q.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.31.attn_q.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.31.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.31.attn_q.weight:qs", sym_visibility = "private", type = tensor<4096x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.31.attn_q.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.31.attn_q.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.31.attn_k.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.31.attn_k.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.31.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.31.attn_k.weight:qs", sym_visibility = "private", type = tensor<1024x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.31.attn_k.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.31.attn_k.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.31.attn_v.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.31.attn_v.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.31.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.31.attn_v.weight:qs", sym_visibility = "private", type = tensor<1024x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.31.attn_v.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.31.attn_v.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.31.attn_scale"> : tensor<f32>, sym_name = "__auto.blk.31.attn_scale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.31.attn_output.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.31.attn_output.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.31.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.31.attn_output.weight:qs", sym_visibility = "private", type = tensor<4096x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.31.ffn_norm.weight"> : tensor<4096xbf16>, sym_name = "__auto.blk.31.ffn_norm.weight", sym_visibility = "private", type = tensor<4096xbf16>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.31.ffn_gate.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.31.ffn_gate.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.31.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.31.ffn_gate.weight:qs", sym_visibility = "private", type = tensor<14336x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.31.ffn_up.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.31.ffn_up.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.31.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.31.ffn_up.weight:qs", sym_visibility = "private", type = tensor<14336x4096xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.31.ffn_down.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.31.ffn_down.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.31.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>, sym_name = "__auto.blk.31.ffn_down.weight:qs", sym_visibility = "private", type = tensor<4096x14336xf8E4M3FNUZ>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"output_norm.weight"> : tensor<4096xbf16>, sym_name = "__auto.output_norm.weight", sym_visibility = "private", type = tensor<4096xbf16>}> : () -> ()
  "util.global"() <{initial_value = #stream.parameter.named<"model"::"output.weight"> : tensor<128256x4096xbf16>, sym_name = "__auto.output.weight", sym_visibility = "private", type = tensor<128256x4096xbf16>}> : () -> ()
  "func.func"() <{arg_attrs = [{}, {}, {}, {}], function_type = (!torch.vtensor<[4,?],si64>, !torch.vtensor<[4],si64>, !torch.vtensor<[4,?],si64>, !torch.tensor<[?,2097152],f8E4M3FNUZ>) -> !torch.vtensor<[4,?,128256],bf16>, sym_name = "prefill_bs4"}> ({
  ^bb0(%arg67: !torch.vtensor<[4,?],si64>, %arg68: !torch.vtensor<[4],si64>, %arg69: !torch.vtensor<[4,?],si64>, %arg70: !torch.tensor<[?,2097152],f8E4M3FNUZ>):
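    // The body opens with a long preamble that loads every util.global declared
    // above and bridges each builtin tensor into the torch dialect via
    // torch_c.from_builtin_tensor, one load/convert pair per parameter.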
    %17186 = "util.global.load"() <{global = @__auto.token_embd.weight}> : () -> tensor<128256x4096xbf16>
    %17187 = "torch_c.from_builtin_tensor"(%17186) : (tensor<128256x4096xbf16>) -> !torch.vtensor<[128256,4096],bf16>
    %17188 = "util.global.load"() <{global = @__auto.blk.0.attn_norm.weight}> : () -> tensor<4096xbf16>
    %17189 = "torch_c.from_builtin_tensor"(%17188) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
    %17190 = "util.global.load"() <{global = @"__auto.blk.0.attn_q.q_input:rscale"}> : () -> tensor<f32>
    %17191 = "torch_c.from_builtin_tensor"(%17190) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17192 = "util.global.load"() <{global = @"__auto.blk.0.attn_q.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
    %17193 = "torch_c.from_builtin_tensor"(%17192) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
    %17194 = "util.global.load"() <{global = @"__auto.blk.0.attn_q.q_output:rscale"}> : () -> tensor<f32>
    %17195 = "torch_c.from_builtin_tensor"(%17194) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17196 = "util.global.load"() <{global = @"__auto.blk.0.attn_k.q_input:rscale"}> : () -> tensor<f32>
    %17197 = "torch_c.from_builtin_tensor"(%17196) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17198 = "util.global.load"() <{global = @"__auto.blk.0.attn_k.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
    %17199 = "torch_c.from_builtin_tensor"(%17198) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
    %17200 = "util.global.load"() <{global = @"__auto.blk.0.attn_k.q_output:rscale"}> : () -> tensor<f32>
    %17201 = "torch_c.from_builtin_tensor"(%17200) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17202 = "util.global.load"() <{global = @"__auto.blk.0.attn_v.q_input:rscale"}> : () -> tensor<f32>
    %17203 = "torch_c.from_builtin_tensor"(%17202) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17204 = "util.global.load"() <{global = @"__auto.blk.0.attn_v.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
    %17205 = "torch_c.from_builtin_tensor"(%17204) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
    %17206 = "util.global.load"() <{global = @"__auto.blk.0.attn_v.q_output:rscale"}> : () -> tensor<f32>
    %17207 = "torch_c.from_builtin_tensor"(%17206) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17208 = "util.global.load"() <{global = @__auto.blk.0.attn_scale}> : () -> tensor<f32>
    %17209 = "torch_c.from_builtin_tensor"(%17208) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17210 = "util.global.load"() <{global = @"__auto.blk.0.attn_output.q_input:rscale"}> : () -> tensor<f32>
    %17211 = "torch_c.from_builtin_tensor"(%17210) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17212 = "util.global.load"() <{global = @"__auto.blk.0.attn_output.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
    %17213 = "torch_c.from_builtin_tensor"(%17212) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
    %17214 = "util.global.load"() <{global = @__auto.blk.0.ffn_norm.weight}> : () -> tensor<4096xbf16>
    %17215 = "torch_c.from_builtin_tensor"(%17214) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
    %17216 = "util.global.load"() <{global = @"__auto.blk.0.ffn_gate.q_input:rscale"}> : () -> tensor<f32>
    %17217 = "torch_c.from_builtin_tensor"(%17216) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17218 = "util.global.load"() <{global = @"__auto.blk.0.ffn_gate.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
    %17219 = "torch_c.from_builtin_tensor"(%17218) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
    %17220 = "util.global.load"() <{global = @"__auto.blk.0.ffn_up.q_input:rscale"}> : () -> tensor<f32>
    %17221 = "torch_c.from_builtin_tensor"(%17220) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17222 = "util.global.load"() <{global = @"__auto.blk.0.ffn_up.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
    %17223 = "torch_c.from_builtin_tensor"(%17222) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
    %17224 = "util.global.load"() <{global = @"__auto.blk.0.ffn_down.q_input:rscale"}> : () -> tensor<f32>
    %17225 = "torch_c.from_builtin_tensor"(%17224) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17226 = "util.global.load"() <{global = @"__auto.blk.0.ffn_down.weight:qs"}> : () -> tensor<4096x14336xf8E4M3FNUZ>
    %17227 = "torch_c.from_builtin_tensor"(%17226) : (tensor<4096x14336xf8E4M3FNUZ>) -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
    %17228 = "util.global.load"() <{global = @__auto.blk.1.attn_norm.weight}> : () -> tensor<4096xbf16>
    %17229 = "torch_c.from_builtin_tensor"(%17228) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
    %17230 = "util.global.load"() <{global = @"__auto.blk.1.attn_q.q_input:rscale"}> : () -> tensor<f32>
    %17231 = "torch_c.from_builtin_tensor"(%17230) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17232 = "util.global.load"() <{global = @"__auto.blk.1.attn_q.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
    %17233 = "torch_c.from_builtin_tensor"(%17232) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
    %17234 = "util.global.load"() <{global = @"__auto.blk.1.attn_q.q_output:rscale"}> : () -> tensor<f32>
    %17235 = "torch_c.from_builtin_tensor"(%17234) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17236 = "util.global.load"() <{global = @"__auto.blk.1.attn_k.q_input:rscale"}> : () -> tensor<f32>
    %17237 = "torch_c.from_builtin_tensor"(%17236) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17238 = "util.global.load"() <{global = @"__auto.blk.1.attn_k.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
    %17239 = "torch_c.from_builtin_tensor"(%17238) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
    %17240 = "util.global.load"() <{global = @"__auto.blk.1.attn_k.q_output:rscale"}> : () -> tensor<f32>
    %17241 = "torch_c.from_builtin_tensor"(%17240) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17242 = "util.global.load"() <{global = @"__auto.blk.1.attn_v.q_input:rscale"}> : () -> tensor<f32>
    %17243 = "torch_c.from_builtin_tensor"(%17242) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17244 = "util.global.load"() <{global = @"__auto.blk.1.attn_v.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
    %17245 = "torch_c.from_builtin_tensor"(%17244) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
    %17246 = "util.global.load"() <{global = @"__auto.blk.1.attn_v.q_output:rscale"}> : () -> tensor<f32>
    %17247 = "torch_c.from_builtin_tensor"(%17246) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17248 = "util.global.load"() <{global = @__auto.blk.1.attn_scale}> : () -> tensor<f32>
    %17249 = "torch_c.from_builtin_tensor"(%17248) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17250 = "util.global.load"() <{global = @"__auto.blk.1.attn_output.q_input:rscale"}> : () -> tensor<f32>
    %17251 = "torch_c.from_builtin_tensor"(%17250) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17252 = "util.global.load"() <{global = @"__auto.blk.1.attn_output.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
    %17253 = "torch_c.from_builtin_tensor"(%17252) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
    %17254 = "util.global.load"() <{global = @__auto.blk.1.ffn_norm.weight}> : () -> tensor<4096xbf16>
    %17255 = "torch_c.from_builtin_tensor"(%17254) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
    %17256 = "util.global.load"() <{global = @"__auto.blk.1.ffn_gate.q_input:rscale"}> : () -> tensor<f32>
    %17257 = "torch_c.from_builtin_tensor"(%17256) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17258 = "util.global.load"() <{global = @"__auto.blk.1.ffn_gate.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
    %17259 = "torch_c.from_builtin_tensor"(%17258) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
    %17260 = "util.global.load"() <{global = @"__auto.blk.1.ffn_up.q_input:rscale"}> : () -> tensor<f32>
    %17261 = "torch_c.from_builtin_tensor"(%17260) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17262 = "util.global.load"() <{global = @"__auto.blk.1.ffn_up.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
    %17263 = "torch_c.from_builtin_tensor"(%17262) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
    %17264 = "util.global.load"() <{global = @"__auto.blk.1.ffn_down.q_input:rscale"}> : () -> tensor<f32>
    %17265 = "torch_c.from_builtin_tensor"(%17264) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17266 = "util.global.load"() <{global = @"__auto.blk.1.ffn_down.weight:qs"}> : () -> tensor<4096x14336xf8E4M3FNUZ>
    %17267 = "torch_c.from_builtin_tensor"(%17266) : (tensor<4096x14336xf8E4M3FNUZ>) -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
    %17268 = "util.global.load"() <{global = @__auto.blk.2.attn_norm.weight}> : () -> tensor<4096xbf16>
    %17269 = "torch_c.from_builtin_tensor"(%17268) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
    %17270 = "util.global.load"() <{global = @"__auto.blk.2.attn_q.q_input:rscale"}> : () -> tensor<f32>
    %17271 = "torch_c.from_builtin_tensor"(%17270) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17272 = "util.global.load"() <{global = @"__auto.blk.2.attn_q.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
    %17273 = "torch_c.from_builtin_tensor"(%17272) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
    %17274 = "util.global.load"() <{global = @"__auto.blk.2.attn_q.q_output:rscale"}> : () -> tensor<f32>
    %17275 = "torch_c.from_builtin_tensor"(%17274) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17276 = "util.global.load"() <{global = @"__auto.blk.2.attn_k.q_input:rscale"}> : () -> tensor<f32>
    %17277 = "torch_c.from_builtin_tensor"(%17276) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17278 = "util.global.load"() <{global = @"__auto.blk.2.attn_k.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
    %17279 = "torch_c.from_builtin_tensor"(%17278) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
    %17280 = "util.global.load"() <{global = @"__auto.blk.2.attn_k.q_output:rscale"}> : () -> tensor<f32>
    %17281 = "torch_c.from_builtin_tensor"(%17280) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17282 = "util.global.load"() <{global = @"__auto.blk.2.attn_v.q_input:rscale"}> : () -> tensor<f32>
    %17283 = "torch_c.from_builtin_tensor"(%17282) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17284 = "util.global.load"() <{global = @"__auto.blk.2.attn_v.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
    %17285 = "torch_c.from_builtin_tensor"(%17284) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
    %17286 = "util.global.load"() <{global = @"__auto.blk.2.attn_v.q_output:rscale"}> : () -> tensor<f32>
    %17287 = "torch_c.from_builtin_tensor"(%17286) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17288 = "util.global.load"() <{global = @__auto.blk.2.attn_scale}> : () -> tensor<f32>
    %17289 = "torch_c.from_builtin_tensor"(%17288) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17290 = "util.global.load"() <{global = @"__auto.blk.2.attn_output.q_input:rscale"}> : () -> tensor<f32>
    %17291 = "torch_c.from_builtin_tensor"(%17290) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17292 = "util.global.load"() <{global = @"__auto.blk.2.attn_output.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
    %17293 = "torch_c.from_builtin_tensor"(%17292) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
    %17294 = "util.global.load"() <{global = @__auto.blk.2.ffn_norm.weight}> : () -> tensor<4096xbf16>
    %17295 = "torch_c.from_builtin_tensor"(%17294) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
    %17296 = "util.global.load"() <{global = @"__auto.blk.2.ffn_gate.q_input:rscale"}> : () -> tensor<f32>
    %17297 = "torch_c.from_builtin_tensor"(%17296) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17298 = "util.global.load"() <{global = @"__auto.blk.2.ffn_gate.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
    %17299 = "torch_c.from_builtin_tensor"(%17298) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
    %17300 = "util.global.load"() <{global = @"__auto.blk.2.ffn_up.q_input:rscale"}> : () -> tensor<f32>
    %17301 = "torch_c.from_builtin_tensor"(%17300) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17302 = "util.global.load"() <{global = @"__auto.blk.2.ffn_up.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
    %17303 = "torch_c.from_builtin_tensor"(%17302) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
    %17304 = "util.global.load"() <{global = @"__auto.blk.2.ffn_down.q_input:rscale"}> : () -> tensor<f32>
    %17305 = "torch_c.from_builtin_tensor"(%17304) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17306 = "util.global.load"() <{global = @"__auto.blk.2.ffn_down.weight:qs"}> : () -> tensor<4096x14336xf8E4M3FNUZ>
    %17307 = "torch_c.from_builtin_tensor"(%17306) : (tensor<4096x14336xf8E4M3FNUZ>) -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
    %17308 = "util.global.load"() <{global = @__auto.blk.3.attn_norm.weight}> : () -> tensor<4096xbf16>
    %17309 = "torch_c.from_builtin_tensor"(%17308) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
    %17310 = "util.global.load"() <{global = @"__auto.blk.3.attn_q.q_input:rscale"}> : () -> tensor<f32>
    %17311 = "torch_c.from_builtin_tensor"(%17310) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17312 = "util.global.load"() <{global = @"__auto.blk.3.attn_q.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
    %17313 = "torch_c.from_builtin_tensor"(%17312) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
    %17314 = "util.global.load"() <{global = @"__auto.blk.3.attn_q.q_output:rscale"}> : () -> tensor<f32>
    %17315 = "torch_c.from_builtin_tensor"(%17314) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17316 = "util.global.load"() <{global = @"__auto.blk.3.attn_k.q_input:rscale"}> : () -> tensor<f32>
    %17317 = "torch_c.from_builtin_tensor"(%17316) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17318 = "util.global.load"() <{global = @"__auto.blk.3.attn_k.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
    %17319 = "torch_c.from_builtin_tensor"(%17318) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
    %17320 = "util.global.load"() <{global = @"__auto.blk.3.attn_k.q_output:rscale"}> : () -> tensor<f32>
    %17321 = "torch_c.from_builtin_tensor"(%17320) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17322 = "util.global.load"() <{global = @"__auto.blk.3.attn_v.q_input:rscale"}> : () -> tensor<f32>
    %17323 = "torch_c.from_builtin_tensor"(%17322) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17324 = "util.global.load"() <{global = @"__auto.blk.3.attn_v.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
    %17325 = "torch_c.from_builtin_tensor"(%17324) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
    %17326 = "util.global.load"() <{global = @"__auto.blk.3.attn_v.q_output:rscale"}> : () -> tensor<f32>
    %17327 = "torch_c.from_builtin_tensor"(%17326) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17328 = "util.global.load"() <{global = @__auto.blk.3.attn_scale}> : () -> tensor<f32>
    %17329 = "torch_c.from_builtin_tensor"(%17328) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17330 = "util.global.load"() <{global = @"__auto.blk.3.attn_output.q_input:rscale"}> : () -> tensor<f32>
    %17331 = "torch_c.from_builtin_tensor"(%17330) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17332 = "util.global.load"() <{global = @"__auto.blk.3.attn_output.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
    %17333 = "torch_c.from_builtin_tensor"(%17332) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
    %17334 = "util.global.load"() <{global = @__auto.blk.3.ffn_norm.weight}> : () -> tensor<4096xbf16>
    %17335 = "torch_c.from_builtin_tensor"(%17334) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
    %17336 = "util.global.load"() <{global = @"__auto.blk.3.ffn_gate.q_input:rscale"}> : () -> tensor<f32>
    %17337 = "torch_c.from_builtin_tensor"(%17336) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17338 = "util.global.load"() <{global = @"__auto.blk.3.ffn_gate.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
    %17339 = "torch_c.from_builtin_tensor"(%17338) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
    %17340 = "util.global.load"() <{global = @"__auto.blk.3.ffn_up.q_input:rscale"}> : () -> tensor<f32>
    %17341 = "torch_c.from_builtin_tensor"(%17340) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17342 = "util.global.load"() <{global = @"__auto.blk.3.ffn_up.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
    %17343 = "torch_c.from_builtin_tensor"(%17342) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
    %17344 = "util.global.load"() <{global = @"__auto.blk.3.ffn_down.q_input:rscale"}> : () -> tensor<f32>
    %17345 = "torch_c.from_builtin_tensor"(%17344) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17346 = "util.global.load"() <{global = @"__auto.blk.3.ffn_down.weight:qs"}> : () -> tensor<4096x14336xf8E4M3FNUZ>
    %17347 = "torch_c.from_builtin_tensor"(%17346) : (tensor<4096x14336xf8E4M3FNUZ>) -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
    %17348 = "util.global.load"() <{global = @__auto.blk.4.attn_norm.weight}> : () -> tensor<4096xbf16>
    %17349 = "torch_c.from_builtin_tensor"(%17348) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
    %17350 = "util.global.load"() <{global = @"__auto.blk.4.attn_q.q_input:rscale"}> : () -> tensor<f32>
    %17351 = "torch_c.from_builtin_tensor"(%17350) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17352 = "util.global.load"() <{global = @"__auto.blk.4.attn_q.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
    %17353 = "torch_c.from_builtin_tensor"(%17352) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
    %17354 = "util.global.load"() <{global = @"__auto.blk.4.attn_q.q_output:rscale"}> : () -> tensor<f32>
    %17355 = "torch_c.from_builtin_tensor"(%17354) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17356 = "util.global.load"() <{global = @"__auto.blk.4.attn_k.q_input:rscale"}> : () -> tensor<f32>
    %17357 = "torch_c.from_builtin_tensor"(%17356) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17358 = "util.global.load"() <{global = @"__auto.blk.4.attn_k.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
    %17359 = "torch_c.from_builtin_tensor"(%17358) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
    %17360 = "util.global.load"() <{global = @"__auto.blk.4.attn_k.q_output:rscale"}> : () -> tensor<f32>
    %17361 = "torch_c.from_builtin_tensor"(%17360) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17362 = "util.global.load"() <{global = @"__auto.blk.4.attn_v.q_input:rscale"}> : () -> tensor<f32>
    %17363 = "torch_c.from_builtin_tensor"(%17362) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17364 = "util.global.load"() <{global = @"__auto.blk.4.attn_v.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
    %17365 = "torch_c.from_builtin_tensor"(%17364) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
    %17366 = "util.global.load"() <{global = @"__auto.blk.4.attn_v.q_output:rscale"}> : () -> tensor<f32>
    %17367 = "torch_c.from_builtin_tensor"(%17366) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17368 = "util.global.load"() <{global = @__auto.blk.4.attn_scale}> : () -> tensor<f32>
    %17369 = "torch_c.from_builtin_tensor"(%17368) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17370 = "util.global.load"() <{global = @"__auto.blk.4.attn_output.q_input:rscale"}> : () -> tensor<f32>
    %17371 = "torch_c.from_builtin_tensor"(%17370) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17372 = "util.global.load"() <{global = @"__auto.blk.4.attn_output.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
    %17373 = "torch_c.from_builtin_tensor"(%17372) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
    %17374 = "util.global.load"() <{global = @__auto.blk.4.ffn_norm.weight}> : () -> tensor<4096xbf16>
    %17375 = "torch_c.from_builtin_tensor"(%17374) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
    %17376 = "util.global.load"() <{global = @"__auto.blk.4.ffn_gate.q_input:rscale"}> : () -> tensor<f32>
    %17377 = "torch_c.from_builtin_tensor"(%17376) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17378 = "util.global.load"() <{global = @"__auto.blk.4.ffn_gate.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
    %17379 = "torch_c.from_builtin_tensor"(%17378) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
    %17380 = "util.global.load"() <{global = @"__auto.blk.4.ffn_up.q_input:rscale"}> : () -> tensor<f32>
    %17381 = "torch_c.from_builtin_tensor"(%17380) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17382 = "util.global.load"() <{global = @"__auto.blk.4.ffn_up.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
    %17383 = "torch_c.from_builtin_tensor"(%17382) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
    %17384 = "util.global.load"() <{global = @"__auto.blk.4.ffn_down.q_input:rscale"}> : () -> tensor<f32>
    %17385 = "torch_c.from_builtin_tensor"(%17384) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17386 = "util.global.load"() <{global = @"__auto.blk.4.ffn_down.weight:qs"}> : () -> tensor<4096x14336xf8E4M3FNUZ>
    %17387 = "torch_c.from_builtin_tensor"(%17386) : (tensor<4096x14336xf8E4M3FNUZ>) -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
    %17388 = "util.global.load"() <{global = @__auto.blk.5.attn_norm.weight}> : () -> tensor<4096xbf16>
    %17389 = "torch_c.from_builtin_tensor"(%17388) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
    %17390 = "util.global.load"() <{global = @"__auto.blk.5.attn_q.q_input:rscale"}> : () -> tensor<f32>
    %17391 = "torch_c.from_builtin_tensor"(%17390) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17392 = "util.global.load"() <{global = @"__auto.blk.5.attn_q.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
    %17393 = "torch_c.from_builtin_tensor"(%17392) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
    %17394 = "util.global.load"() <{global = @"__auto.blk.5.attn_q.q_output:rscale"}> : () -> tensor<f32>
    %17395 = "torch_c.from_builtin_tensor"(%17394) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17396 = "util.global.load"() <{global = @"__auto.blk.5.attn_k.q_input:rscale"}> : () -> tensor<f32>
    %17397 = "torch_c.from_builtin_tensor"(%17396) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17398 = "util.global.load"() <{global = @"__auto.blk.5.attn_k.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
    %17399 = "torch_c.from_builtin_tensor"(%17398) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
    %17400 = "util.global.load"() <{global = @"__auto.blk.5.attn_k.q_output:rscale"}> : () -> tensor<f32>
    %17401 = "torch_c.from_builtin_tensor"(%17400) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17402 = "util.global.load"() <{global = @"__auto.blk.5.attn_v.q_input:rscale"}> : () -> tensor<f32>
    %17403 = "torch_c.from_builtin_tensor"(%17402) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17404 = "util.global.load"() <{global = @"__auto.blk.5.attn_v.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
    %17405 = "torch_c.from_builtin_tensor"(%17404) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
    %17406 = "util.global.load"() <{global = @"__auto.blk.5.attn_v.q_output:rscale"}> : () -> tensor<f32>
    %17407 = "torch_c.from_builtin_tensor"(%17406) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17408 = "util.global.load"() <{global = @__auto.blk.5.attn_scale}> : () -> tensor<f32>
    %17409 = "torch_c.from_builtin_tensor"(%17408) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17410 = "util.global.load"() <{global = @"__auto.blk.5.attn_output.q_input:rscale"}> : () -> tensor<f32>
    %17411 = "torch_c.from_builtin_tensor"(%17410) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17412 = "util.global.load"() <{global = @"__auto.blk.5.attn_output.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
    %17413 = "torch_c.from_builtin_tensor"(%17412) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
    %17414 = "util.global.load"() <{global = @__auto.blk.5.ffn_norm.weight}> : () -> tensor<4096xbf16>
    %17415 = "torch_c.from_builtin_tensor"(%17414) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
    %17416 = "util.global.load"() <{global = @"__auto.blk.5.ffn_gate.q_input:rscale"}> : () -> tensor<f32>
    %17417 = "torch_c.from_builtin_tensor"(%17416) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17418 = "util.global.load"() <{global = @"__auto.blk.5.ffn_gate.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
    %17419 = "torch_c.from_builtin_tensor"(%17418) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
    %17420 = "util.global.load"() <{global = @"__auto.blk.5.ffn_up.q_input:rscale"}> : () -> tensor<f32>
    %17421 = "torch_c.from_builtin_tensor"(%17420) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17422 = "util.global.load"() <{global = @"__auto.blk.5.ffn_up.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
    %17423 = "torch_c.from_builtin_tensor"(%17422) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
    %17424 = "util.global.load"() <{global = @"__auto.blk.5.ffn_down.q_input:rscale"}> : () -> tensor<f32>
    %17425 = "torch_c.from_builtin_tensor"(%17424) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17426 = "util.global.load"() <{global = @"__auto.blk.5.ffn_down.weight:qs"}> : () -> tensor<4096x14336xf8E4M3FNUZ>
    %17427 = "torch_c.from_builtin_tensor"(%17426) : (tensor<4096x14336xf8E4M3FNUZ>) -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
    %17428 = "util.global.load"() <{global = @__auto.blk.6.attn_norm.weight}> : () -> tensor<4096xbf16>
    %17429 = "torch_c.from_builtin_tensor"(%17428) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
    %17430 = "util.global.load"() <{global = @"__auto.blk.6.attn_q.q_input:rscale"}> : () -> tensor<f32>
    %17431 = "torch_c.from_builtin_tensor"(%17430) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17432 = "util.global.load"() <{global = @"__auto.blk.6.attn_q.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
    %17433 = "torch_c.from_builtin_tensor"(%17432) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
    %17434 = "util.global.load"() <{global = @"__auto.blk.6.attn_q.q_output:rscale"}> : () -> tensor<f32>
    %17435 = "torch_c.from_builtin_tensor"(%17434) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17436 = "util.global.load"() <{global = @"__auto.blk.6.attn_k.q_input:rscale"}> : () -> tensor<f32>
    %17437 = "torch_c.from_builtin_tensor"(%17436) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17438 = "util.global.load"() <{global = @"__auto.blk.6.attn_k.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
    %17439 = "torch_c.from_builtin_tensor"(%17438) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
    %17440 = "util.global.load"() <{global = @"__auto.blk.6.attn_k.q_output:rscale"}> : () -> tensor<f32>
    %17441 = "torch_c.from_builtin_tensor"(%17440) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17442 = "util.global.load"() <{global = @"__auto.blk.6.attn_v.q_input:rscale"}> : () -> tensor<f32>
    %17443 = "torch_c.from_builtin_tensor"(%17442) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17444 = "util.global.load"() <{global = @"__auto.blk.6.attn_v.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
    %17445 = "torch_c.from_builtin_tensor"(%17444) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
    %17446 = "util.global.load"() <{global = @"__auto.blk.6.attn_v.q_output:rscale"}> : () -> tensor<f32>
    %17447 = "torch_c.from_builtin_tensor"(%17446) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17448 = "util.global.load"() <{global = @__auto.blk.6.attn_scale}> : () -> tensor<f32>
    %17449 = "torch_c.from_builtin_tensor"(%17448) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17450 = "util.global.load"() <{global = @"__auto.blk.6.attn_output.q_input:rscale"}> : () -> tensor<f32>
    %17451 = "torch_c.from_builtin_tensor"(%17450) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17452 = "util.global.load"() <{global = @"__auto.blk.6.attn_output.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
    %17453 = "torch_c.from_builtin_tensor"(%17452) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
    %17454 = "util.global.load"() <{global = @__auto.blk.6.ffn_norm.weight}> : () -> tensor<4096xbf16>
    %17455 = "torch_c.from_builtin_tensor"(%17454) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
    %17456 = "util.global.load"() <{global = @"__auto.blk.6.ffn_gate.q_input:rscale"}> : () -> tensor<f32>
    %17457 = "torch_c.from_builtin_tensor"(%17456) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17458 = "util.global.load"() <{global = @"__auto.blk.6.ffn_gate.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
    %17459 = "torch_c.from_builtin_tensor"(%17458) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
    %17460 = "util.global.load"() <{global = @"__auto.blk.6.ffn_up.q_input:rscale"}> : () -> tensor<f32>
    %17461 = "torch_c.from_builtin_tensor"(%17460) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17462 = "util.global.load"() <{global = @"__auto.blk.6.ffn_up.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
    %17463 = "torch_c.from_builtin_tensor"(%17462) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
    %17464 = "util.global.load"() <{global = @"__auto.blk.6.ffn_down.q_input:rscale"}> : () -> tensor<f32>
    %17465 = "torch_c.from_builtin_tensor"(%17464) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17466 = "util.global.load"() <{global = @"__auto.blk.6.ffn_down.weight:qs"}> : () -> tensor<4096x14336xf8E4M3FNUZ>
    %17467 = "torch_c.from_builtin_tensor"(%17466) : (tensor<4096x14336xf8E4M3FNUZ>) -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
    %17468 = "util.global.load"() <{global = @__auto.blk.7.attn_norm.weight}> : () -> tensor<4096xbf16>
    %17469 = "torch_c.from_builtin_tensor"(%17468) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
    %17470 = "util.global.load"() <{global = @"__auto.blk.7.attn_q.q_input:rscale"}> : () -> tensor<f32>
    %17471 = "torch_c.from_builtin_tensor"(%17470) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17472 = "util.global.load"() <{global = @"__auto.blk.7.attn_q.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
    %17473 = "torch_c.from_builtin_tensor"(%17472) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
    %17474 = "util.global.load"() <{global = @"__auto.blk.7.attn_q.q_output:rscale"}> : () -> tensor<f32>
    %17475 = "torch_c.from_builtin_tensor"(%17474) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17476 = "util.global.load"() <{global = @"__auto.blk.7.attn_k.q_input:rscale"}> : () -> tensor<f32>
    %17477 = "torch_c.from_builtin_tensor"(%17476) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17478 = "util.global.load"() <{global = @"__auto.blk.7.attn_k.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
    %17479 = "torch_c.from_builtin_tensor"(%17478) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
    %17480 = "util.global.load"() <{global = @"__auto.blk.7.attn_k.q_output:rscale"}> : () -> tensor<f32>
    %17481 = "torch_c.from_builtin_tensor"(%17480) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17482 = "util.global.load"() <{global = @"__auto.blk.7.attn_v.q_input:rscale"}> : () -> tensor<f32>
    %17483 = "torch_c.from_builtin_tensor"(%17482) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17484 = "util.global.load"() <{global = @"__auto.blk.7.attn_v.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
    %17485 = "torch_c.from_builtin_tensor"(%17484) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
    %17486 = "util.global.load"() <{global = @"__auto.blk.7.attn_v.q_output:rscale"}> : () -> tensor<f32>
    %17487 = "torch_c.from_builtin_tensor"(%17486) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17488 = "util.global.load"() <{global = @__auto.blk.7.attn_scale}> : () -> tensor<f32>
    %17489 = "torch_c.from_builtin_tensor"(%17488) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17490 = "util.global.load"() <{global = @"__auto.blk.7.attn_output.q_input:rscale"}> : () -> tensor<f32>
    %17491 = "torch_c.from_builtin_tensor"(%17490) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17492 = "util.global.load"() <{global = @"__auto.blk.7.attn_output.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
    %17493 = "torch_c.from_builtin_tensor"(%17492) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
    %17494 = "util.global.load"() <{global = @__auto.blk.7.ffn_norm.weight}> : () -> tensor<4096xbf16>
    %17495 = "torch_c.from_builtin_tensor"(%17494) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
    %17496 = "util.global.load"() <{global = @"__auto.blk.7.ffn_gate.q_input:rscale"}> : () -> tensor<f32>
    %17497 = "torch_c.from_builtin_tensor"(%17496) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17498 = "util.global.load"() <{global = @"__auto.blk.7.ffn_gate.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
    %17499 = "torch_c.from_builtin_tensor"(%17498) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
    %17500 = "util.global.load"() <{global = @"__auto.blk.7.ffn_up.q_input:rscale"}> : () -> tensor<f32>
    %17501 = "torch_c.from_builtin_tensor"(%17500) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17502 = "util.global.load"() <{global = @"__auto.blk.7.ffn_up.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
    %17503 = "torch_c.from_builtin_tensor"(%17502) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
    %17504 = "util.global.load"() <{global = @"__auto.blk.7.ffn_down.q_input:rscale"}> : () -> tensor<f32>
    %17505 = "torch_c.from_builtin_tensor"(%17504) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17506 = "util.global.load"() <{global = @"__auto.blk.7.ffn_down.weight:qs"}> : () -> tensor<4096x14336xf8E4M3FNUZ>
    %17507 = "torch_c.from_builtin_tensor"(%17506) : (tensor<4096x14336xf8E4M3FNUZ>) -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
    %17508 = "util.global.load"() <{global = @__auto.blk.8.attn_norm.weight}> : () -> tensor<4096xbf16>
    %17509 = "torch_c.from_builtin_tensor"(%17508) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
    %17510 = "util.global.load"() <{global = @"__auto.blk.8.attn_q.q_input:rscale"}> : () -> tensor<f32>
    %17511 = "torch_c.from_builtin_tensor"(%17510) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17512 = "util.global.load"() <{global = @"__auto.blk.8.attn_q.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
    %17513 = "torch_c.from_builtin_tensor"(%17512) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
    %17514 = "util.global.load"() <{global = @"__auto.blk.8.attn_q.q_output:rscale"}> : () -> tensor<f32>
    %17515 = "torch_c.from_builtin_tensor"(%17514) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17516 = "util.global.load"() <{global = @"__auto.blk.8.attn_k.q_input:rscale"}> : () -> tensor<f32>
    %17517 = "torch_c.from_builtin_tensor"(%17516) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17518 = "util.global.load"() <{global = @"__auto.blk.8.attn_k.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
    %17519 = "torch_c.from_builtin_tensor"(%17518) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
    %17520 = "util.global.load"() <{global = @"__auto.blk.8.attn_k.q_output:rscale"}> : () -> tensor<f32>
    %17521 = "torch_c.from_builtin_tensor"(%17520) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17522 = "util.global.load"() <{global = @"__auto.blk.8.attn_v.q_input:rscale"}> : () -> tensor<f32>
    %17523 = "torch_c.from_builtin_tensor"(%17522) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17524 = "util.global.load"() <{global = @"__auto.blk.8.attn_v.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
    %17525 = "torch_c.from_builtin_tensor"(%17524) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
    %17526 = "util.global.load"() <{global = @"__auto.blk.8.attn_v.q_output:rscale"}> : () -> tensor<f32>
    %17527 = "torch_c.from_builtin_tensor"(%17526) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17528 = "util.global.load"() <{global = @__auto.blk.8.attn_scale}> : () -> tensor<f32>
    %17529 = "torch_c.from_builtin_tensor"(%17528) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17530 = "util.global.load"() <{global = @"__auto.blk.8.attn_output.q_input:rscale"}> : () -> tensor<f32>
    %17531 = "torch_c.from_builtin_tensor"(%17530) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17532 = "util.global.load"() <{global = @"__auto.blk.8.attn_output.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
    %17533 = "torch_c.from_builtin_tensor"(%17532) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
    %17534 = "util.global.load"() <{global = @__auto.blk.8.ffn_norm.weight}> : () -> tensor<4096xbf16>
    %17535 = "torch_c.from_builtin_tensor"(%17534) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
    %17536 = "util.global.load"() <{global = @"__auto.blk.8.ffn_gate.q_input:rscale"}> : () -> tensor<f32>
    %17537 = "torch_c.from_builtin_tensor"(%17536) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17538 = "util.global.load"() <{global = @"__auto.blk.8.ffn_gate.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
    %17539 = "torch_c.from_builtin_tensor"(%17538) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
    %17540 = "util.global.load"() <{global = @"__auto.blk.8.ffn_up.q_input:rscale"}> : () -> tensor<f32>
    %17541 = "torch_c.from_builtin_tensor"(%17540) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17542 = "util.global.load"() <{global = @"__auto.blk.8.ffn_up.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
    %17543 = "torch_c.from_builtin_tensor"(%17542) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
    %17544 = "util.global.load"() <{global = @"__auto.blk.8.ffn_down.q_input:rscale"}> : () -> tensor<f32>
    %17545 = "torch_c.from_builtin_tensor"(%17544) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17546 = "util.global.load"() <{global = @"__auto.blk.8.ffn_down.weight:qs"}> : () -> tensor<4096x14336xf8E4M3FNUZ>
    %17547 = "torch_c.from_builtin_tensor"(%17546) : (tensor<4096x14336xf8E4M3FNUZ>) -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
    %17548 = "util.global.load"() <{global = @__auto.blk.9.attn_norm.weight}> : () -> tensor<4096xbf16>
    %17549 = "torch_c.from_builtin_tensor"(%17548) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
    %17550 = "util.global.load"() <{global = @"__auto.blk.9.attn_q.q_input:rscale"}> : () -> tensor<f32>
    %17551 = "torch_c.from_builtin_tensor"(%17550) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17552 = "util.global.load"() <{global = @"__auto.blk.9.attn_q.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
    %17553 = "torch_c.from_builtin_tensor"(%17552) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
    %17554 = "util.global.load"() <{global = @"__auto.blk.9.attn_q.q_output:rscale"}> : () -> tensor<f32>
    %17555 = "torch_c.from_builtin_tensor"(%17554) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17556 = "util.global.load"() <{global = @"__auto.blk.9.attn_k.q_input:rscale"}> : () -> tensor<f32>
    %17557 = "torch_c.from_builtin_tensor"(%17556) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17558 = "util.global.load"() <{global = @"__auto.blk.9.attn_k.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
    %17559 = "torch_c.from_builtin_tensor"(%17558) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
    %17560 = "util.global.load"() <{global = @"__auto.blk.9.attn_k.q_output:rscale"}> : () -> tensor<f32>
    %17561 = "torch_c.from_builtin_tensor"(%17560) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17562 = "util.global.load"() <{global = @"__auto.blk.9.attn_v.q_input:rscale"}> : () -> tensor<f32>
    %17563 = "torch_c.from_builtin_tensor"(%17562) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17564 = "util.global.load"() <{global = @"__auto.blk.9.attn_v.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
    %17565 = "torch_c.from_builtin_tensor"(%17564) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
    %17566 = "util.global.load"() <{global = @"__auto.blk.9.attn_v.q_output:rscale"}> : () -> tensor<f32>
    %17567 = "torch_c.from_builtin_tensor"(%17566) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17568 = "util.global.load"() <{global = @__auto.blk.9.attn_scale}> : () -> tensor<f32>
    %17569 = "torch_c.from_builtin_tensor"(%17568) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17570 = "util.global.load"() <{global = @"__auto.blk.9.attn_output.q_input:rscale"}> : () -> tensor<f32>
    %17571 = "torch_c.from_builtin_tensor"(%17570) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17572 = "util.global.load"() <{global = @"__auto.blk.9.attn_output.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
    %17573 = "torch_c.from_builtin_tensor"(%17572) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
    %17574 = "util.global.load"() <{global = @__auto.blk.9.ffn_norm.weight}> : () -> tensor<4096xbf16>
    %17575 = "torch_c.from_builtin_tensor"(%17574) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
    %17576 = "util.global.load"() <{global = @"__auto.blk.9.ffn_gate.q_input:rscale"}> : () -> tensor<f32>
    %17577 = "torch_c.from_builtin_tensor"(%17576) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17578 = "util.global.load"() <{global = @"__auto.blk.9.ffn_gate.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
    %17579 = "torch_c.from_builtin_tensor"(%17578) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
    %17580 = "util.global.load"() <{global = @"__auto.blk.9.ffn_up.q_input:rscale"}> : () -> tensor<f32>
    %17581 = "torch_c.from_builtin_tensor"(%17580) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17582 = "util.global.load"() <{global = @"__auto.blk.9.ffn_up.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
    %17583 = "torch_c.from_builtin_tensor"(%17582) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
    %17584 = "util.global.load"() <{global = @"__auto.blk.9.ffn_down.q_input:rscale"}> : () -> tensor<f32>
    %17585 = "torch_c.from_builtin_tensor"(%17584) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17586 = "util.global.load"() <{global = @"__auto.blk.9.ffn_down.weight:qs"}> : () -> tensor<4096x14336xf8E4M3FNUZ>
    %17587 = "torch_c.from_builtin_tensor"(%17586) : (tensor<4096x14336xf8E4M3FNUZ>) -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
    %17588 = "util.global.load"() <{global = @__auto.blk.10.attn_norm.weight}> : () -> tensor<4096xbf16>
    %17589 = "torch_c.from_builtin_tensor"(%17588) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
    %17590 = "util.global.load"() <{global = @"__auto.blk.10.attn_q.q_input:rscale"}> : () -> tensor<f32>
    %17591 = "torch_c.from_builtin_tensor"(%17590) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17592 = "util.global.load"() <{global = @"__auto.blk.10.attn_q.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
    %17593 = "torch_c.from_builtin_tensor"(%17592) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
    %17594 = "util.global.load"() <{global = @"__auto.blk.10.attn_q.q_output:rscale"}> : () -> tensor<f32>
    %17595 = "torch_c.from_builtin_tensor"(%17594) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17596 = "util.global.load"() <{global = @"__auto.blk.10.attn_k.q_input:rscale"}> : () -> tensor<f32>
    %17597 = "torch_c.from_builtin_tensor"(%17596) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17598 = "util.global.load"() <{global = @"__auto.blk.10.attn_k.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
    %17599 = "torch_c.from_builtin_tensor"(%17598) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
    %17600 = "util.global.load"() <{global = @"__auto.blk.10.attn_k.q_output:rscale"}> : () -> tensor<f32>
    %17601 = "torch_c.from_builtin_tensor"(%17600) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17602 = "util.global.load"() <{global = @"__auto.blk.10.attn_v.q_input:rscale"}> : () -> tensor<f32>
    %17603 = "torch_c.from_builtin_tensor"(%17602) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17604 = "util.global.load"() <{global = @"__auto.blk.10.attn_v.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
    %17605 = "torch_c.from_builtin_tensor"(%17604) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
    %17606 = "util.global.load"() <{global = @"__auto.blk.10.attn_v.q_output:rscale"}> : () -> tensor<f32>
    %17607 = "torch_c.from_builtin_tensor"(%17606) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17608 = "util.global.load"() <{global = @__auto.blk.10.attn_scale}> : () -> tensor<f32>
    %17609 = "torch_c.from_builtin_tensor"(%17608) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17610 = "util.global.load"() <{global = @"__auto.blk.10.attn_output.q_input:rscale"}> : () -> tensor<f32>
    %17611 = "torch_c.from_builtin_tensor"(%17610) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17612 = "util.global.load"() <{global = @"__auto.blk.10.attn_output.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
    %17613 = "torch_c.from_builtin_tensor"(%17612) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
    %17614 = "util.global.load"() <{global = @__auto.blk.10.ffn_norm.weight}> : () -> tensor<4096xbf16>
    %17615 = "torch_c.from_builtin_tensor"(%17614) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
    %17616 = "util.global.load"() <{global = @"__auto.blk.10.ffn_gate.q_input:rscale"}> : () -> tensor<f32>
    %17617 = "torch_c.from_builtin_tensor"(%17616) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17618 = "util.global.load"() <{global = @"__auto.blk.10.ffn_gate.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
    %17619 = "torch_c.from_builtin_tensor"(%17618) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
    %17620 = "util.global.load"() <{global = @"__auto.blk.10.ffn_up.q_input:rscale"}> : () -> tensor<f32>
    %17621 = "torch_c.from_builtin_tensor"(%17620) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17622 = "util.global.load"() <{global = @"__auto.blk.10.ffn_up.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
    %17623 = "torch_c.from_builtin_tensor"(%17622) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
    %17624 = "util.global.load"() <{global = @"__auto.blk.10.ffn_down.q_input:rscale"}> : () -> tensor<f32>
    %17625 = "torch_c.from_builtin_tensor"(%17624) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17626 = "util.global.load"() <{global = @"__auto.blk.10.ffn_down.weight:qs"}> : () -> tensor<4096x14336xf8E4M3FNUZ>
    %17627 = "torch_c.from_builtin_tensor"(%17626) : (tensor<4096x14336xf8E4M3FNUZ>) -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
    %17628 = "util.global.load"() <{global = @__auto.blk.11.attn_norm.weight}> : () -> tensor<4096xbf16>
    %17629 = "torch_c.from_builtin_tensor"(%17628) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
    %17630 = "util.global.load"() <{global = @"__auto.blk.11.attn_q.q_input:rscale"}> : () -> tensor<f32>
    %17631 = "torch_c.from_builtin_tensor"(%17630) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17632 = "util.global.load"() <{global = @"__auto.blk.11.attn_q.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
    %17633 = "torch_c.from_builtin_tensor"(%17632) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
    %17634 = "util.global.load"() <{global = @"__auto.blk.11.attn_q.q_output:rscale"}> : () -> tensor<f32>
    %17635 = "torch_c.from_builtin_tensor"(%17634) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17636 = "util.global.load"() <{global = @"__auto.blk.11.attn_k.q_input:rscale"}> : () -> tensor<f32>
    %17637 = "torch_c.from_builtin_tensor"(%17636) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17638 = "util.global.load"() <{global = @"__auto.blk.11.attn_k.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
    %17639 = "torch_c.from_builtin_tensor"(%17638) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
    %17640 = "util.global.load"() <{global = @"__auto.blk.11.attn_k.q_output:rscale"}> : () -> tensor<f32>
    %17641 = "torch_c.from_builtin_tensor"(%17640) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17642 = "util.global.load"() <{global = @"__auto.blk.11.attn_v.q_input:rscale"}> : () -> tensor<f32>
    %17643 = "torch_c.from_builtin_tensor"(%17642) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17644 = "util.global.load"() <{global = @"__auto.blk.11.attn_v.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
    %17645 = "torch_c.from_builtin_tensor"(%17644) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
    %17646 = "util.global.load"() <{global = @"__auto.blk.11.attn_v.q_output:rscale"}> : () -> tensor<f32>
    %17647 = "torch_c.from_builtin_tensor"(%17646) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17648 = "util.global.load"() <{global = @__auto.blk.11.attn_scale}> : () -> tensor<f32>
    %17649 = "torch_c.from_builtin_tensor"(%17648) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17650 = "util.global.load"() <{global = @"__auto.blk.11.attn_output.q_input:rscale"}> : () -> tensor<f32>
    %17651 = "torch_c.from_builtin_tensor"(%17650) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17652 = "util.global.load"() <{global = @"__auto.blk.11.attn_output.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
    %17653 = "torch_c.from_builtin_tensor"(%17652) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
    %17654 = "util.global.load"() <{global = @__auto.blk.11.ffn_norm.weight}> : () -> tensor<4096xbf16>
    %17655 = "torch_c.from_builtin_tensor"(%17654) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
    %17656 = "util.global.load"() <{global = @"__auto.blk.11.ffn_gate.q_input:rscale"}> : () -> tensor<f32>
    %17657 = "torch_c.from_builtin_tensor"(%17656) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17658 = "util.global.load"() <{global = @"__auto.blk.11.ffn_gate.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
    %17659 = "torch_c.from_builtin_tensor"(%17658) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
    %17660 = "util.global.load"() <{global = @"__auto.blk.11.ffn_up.q_input:rscale"}> : () -> tensor<f32>
    %17661 = "torch_c.from_builtin_tensor"(%17660) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17662 = "util.global.load"() <{global = @"__auto.blk.11.ffn_up.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
    %17663 = "torch_c.from_builtin_tensor"(%17662) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
    %17664 = "util.global.load"() <{global = @"__auto.blk.11.ffn_down.q_input:rscale"}> : () -> tensor<f32>
    %17665 = "torch_c.from_builtin_tensor"(%17664) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17666 = "util.global.load"() <{global = @"__auto.blk.11.ffn_down.weight:qs"}> : () -> tensor<4096x14336xf8E4M3FNUZ>
    %17667 = "torch_c.from_builtin_tensor"(%17666) : (tensor<4096x14336xf8E4M3FNUZ>) -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
    %17668 = "util.global.load"() <{global = @__auto.blk.12.attn_norm.weight}> : () -> tensor<4096xbf16>
    %17669 = "torch_c.from_builtin_tensor"(%17668) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
    %17670 = "util.global.load"() <{global = @"__auto.blk.12.attn_q.q_input:rscale"}> : () -> tensor<f32>
    %17671 = "torch_c.from_builtin_tensor"(%17670) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17672 = "util.global.load"() <{global = @"__auto.blk.12.attn_q.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
    %17673 = "torch_c.from_builtin_tensor"(%17672) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
    %17674 = "util.global.load"() <{global = @"__auto.blk.12.attn_q.q_output:rscale"}> : () -> tensor<f32>
    %17675 = "torch_c.from_builtin_tensor"(%17674) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17676 = "util.global.load"() <{global = @"__auto.blk.12.attn_k.q_input:rscale"}> : () -> tensor<f32>
    %17677 = "torch_c.from_builtin_tensor"(%17676) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17678 = "util.global.load"() <{global = @"__auto.blk.12.attn_k.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
    %17679 = "torch_c.from_builtin_tensor"(%17678) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
    %17680 = "util.global.load"() <{global = @"__auto.blk.12.attn_k.q_output:rscale"}> : () -> tensor<f32>
    %17681 = "torch_c.from_builtin_tensor"(%17680) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17682 = "util.global.load"() <{global = @"__auto.blk.12.attn_v.q_input:rscale"}> : () -> tensor<f32>
    %17683 = "torch_c.from_builtin_tensor"(%17682) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17684 = "util.global.load"() <{global = @"__auto.blk.12.attn_v.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
    %17685 = "torch_c.from_builtin_tensor"(%17684) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
    %17686 = "util.global.load"() <{global = @"__auto.blk.12.attn_v.q_output:rscale"}> : () -> tensor<f32>
    %17687 = "torch_c.from_builtin_tensor"(%17686) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17688 = "util.global.load"() <{global = @__auto.blk.12.attn_scale}> : () -> tensor<f32>
    %17689 = "torch_c.from_builtin_tensor"(%17688) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17690 = "util.global.load"() <{global = @"__auto.blk.12.attn_output.q_input:rscale"}> : () -> tensor<f32>
    %17691 = "torch_c.from_builtin_tensor"(%17690) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17692 = "util.global.load"() <{global = @"__auto.blk.12.attn_output.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
    %17693 = "torch_c.from_builtin_tensor"(%17692) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
    %17694 = "util.global.load"() <{global = @__auto.blk.12.ffn_norm.weight}> : () -> tensor<4096xbf16>
    %17695 = "torch_c.from_builtin_tensor"(%17694) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
    %17696 = "util.global.load"() <{global = @"__auto.blk.12.ffn_gate.q_input:rscale"}> : () -> tensor<f32>
    %17697 = "torch_c.from_builtin_tensor"(%17696) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17698 = "util.global.load"() <{global = @"__auto.blk.12.ffn_gate.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
    %17699 = "torch_c.from_builtin_tensor"(%17698) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
    %17700 = "util.global.load"() <{global = @"__auto.blk.12.ffn_up.q_input:rscale"}> : () -> tensor<f32>
    %17701 = "torch_c.from_builtin_tensor"(%17700) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17702 = "util.global.load"() <{global = @"__auto.blk.12.ffn_up.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
    %17703 = "torch_c.from_builtin_tensor"(%17702) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
    %17704 = "util.global.load"() <{global = @"__auto.blk.12.ffn_down.q_input:rscale"}> : () -> tensor<f32>
    %17705 = "torch_c.from_builtin_tensor"(%17704) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17706 = "util.global.load"() <{global = @"__auto.blk.12.ffn_down.weight:qs"}> : () -> tensor<4096x14336xf8E4M3FNUZ>
    %17707 = "torch_c.from_builtin_tensor"(%17706) : (tensor<4096x14336xf8E4M3FNUZ>) -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
    %17708 = "util.global.load"() <{global = @__auto.blk.13.attn_norm.weight}> : () -> tensor<4096xbf16>
    %17709 = "torch_c.from_builtin_tensor"(%17708) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
    %17710 = "util.global.load"() <{global = @"__auto.blk.13.attn_q.q_input:rscale"}> : () -> tensor<f32>
    %17711 = "torch_c.from_builtin_tensor"(%17710) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17712 = "util.global.load"() <{global = @"__auto.blk.13.attn_q.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
    %17713 = "torch_c.from_builtin_tensor"(%17712) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
    %17714 = "util.global.load"() <{global = @"__auto.blk.13.attn_q.q_output:rscale"}> : () -> tensor<f32>
    %17715 = "torch_c.from_builtin_tensor"(%17714) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17716 = "util.global.load"() <{global = @"__auto.blk.13.attn_k.q_input:rscale"}> : () -> tensor<f32>
    %17717 = "torch_c.from_builtin_tensor"(%17716) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17718 = "util.global.load"() <{global = @"__auto.blk.13.attn_k.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
    %17719 = "torch_c.from_builtin_tensor"(%17718) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
    %17720 = "util.global.load"() <{global = @"__auto.blk.13.attn_k.q_output:rscale"}> : () -> tensor<f32>
    %17721 = "torch_c.from_builtin_tensor"(%17720) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17722 = "util.global.load"() <{global = @"__auto.blk.13.attn_v.q_input:rscale"}> : () -> tensor<f32>
    %17723 = "torch_c.from_builtin_tensor"(%17722) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17724 = "util.global.load"() <{global = @"__auto.blk.13.attn_v.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
    %17725 = "torch_c.from_builtin_tensor"(%17724) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
    %17726 = "util.global.load"() <{global = @"__auto.blk.13.attn_v.q_output:rscale"}> : () -> tensor<f32>
    %17727 = "torch_c.from_builtin_tensor"(%17726) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17728 = "util.global.load"() <{global = @__auto.blk.13.attn_scale}> : () -> tensor<f32>
    %17729 = "torch_c.from_builtin_tensor"(%17728) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17730 = "util.global.load"() <{global = @"__auto.blk.13.attn_output.q_input:rscale"}> : () -> tensor<f32>
    %17731 = "torch_c.from_builtin_tensor"(%17730) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17732 = "util.global.load"() <{global = @"__auto.blk.13.attn_output.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
    %17733 = "torch_c.from_builtin_tensor"(%17732) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
    %17734 = "util.global.load"() <{global = @__auto.blk.13.ffn_norm.weight}> : () -> tensor<4096xbf16>
    %17735 = "torch_c.from_builtin_tensor"(%17734) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
    %17736 = "util.global.load"() <{global = @"__auto.blk.13.ffn_gate.q_input:rscale"}> : () -> tensor<f32>
    %17737 = "torch_c.from_builtin_tensor"(%17736) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17738 = "util.global.load"() <{global = @"__auto.blk.13.ffn_gate.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
    %17739 = "torch_c.from_builtin_tensor"(%17738) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
    %17740 = "util.global.load"() <{global = @"__auto.blk.13.ffn_up.q_input:rscale"}> : () -> tensor<f32>
    %17741 = "torch_c.from_builtin_tensor"(%17740) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17742 = "util.global.load"() <{global = @"__auto.blk.13.ffn_up.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
    %17743 = "torch_c.from_builtin_tensor"(%17742) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
    %17744 = "util.global.load"() <{global = @"__auto.blk.13.ffn_down.q_input:rscale"}> : () -> tensor<f32>
    %17745 = "torch_c.from_builtin_tensor"(%17744) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17746 = "util.global.load"() <{global = @"__auto.blk.13.ffn_down.weight:qs"}> : () -> tensor<4096x14336xf8E4M3FNUZ>
    %17747 = "torch_c.from_builtin_tensor"(%17746) : (tensor<4096x14336xf8E4M3FNUZ>) -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
    %17748 = "util.global.load"() <{global = @__auto.blk.14.attn_norm.weight}> : () -> tensor<4096xbf16>
    %17749 = "torch_c.from_builtin_tensor"(%17748) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
    %17750 = "util.global.load"() <{global = @"__auto.blk.14.attn_q.q_input:rscale"}> : () -> tensor<f32>
    %17751 = "torch_c.from_builtin_tensor"(%17750) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17752 = "util.global.load"() <{global = @"__auto.blk.14.attn_q.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
    %17753 = "torch_c.from_builtin_tensor"(%17752) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
    %17754 = "util.global.load"() <{global = @"__auto.blk.14.attn_q.q_output:rscale"}> : () -> tensor<f32>
    %17755 = "torch_c.from_builtin_tensor"(%17754) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17756 = "util.global.load"() <{global = @"__auto.blk.14.attn_k.q_input:rscale"}> : () -> tensor<f32>
    %17757 = "torch_c.from_builtin_tensor"(%17756) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17758 = "util.global.load"() <{global = @"__auto.blk.14.attn_k.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
    %17759 = "torch_c.from_builtin_tensor"(%17758) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
    %17760 = "util.global.load"() <{global = @"__auto.blk.14.attn_k.q_output:rscale"}> : () -> tensor<f32>
    %17761 = "torch_c.from_builtin_tensor"(%17760) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17762 = "util.global.load"() <{global = @"__auto.blk.14.attn_v.q_input:rscale"}> : () -> tensor<f32>
    %17763 = "torch_c.from_builtin_tensor"(%17762) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17764 = "util.global.load"() <{global = @"__auto.blk.14.attn_v.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
    %17765 = "torch_c.from_builtin_tensor"(%17764) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
    %17766 = "util.global.load"() <{global = @"__auto.blk.14.attn_v.q_output:rscale"}> : () -> tensor<f32>
    %17767 = "torch_c.from_builtin_tensor"(%17766) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17768 = "util.global.load"() <{global = @__auto.blk.14.attn_scale}> : () -> tensor<f32>
    %17769 = "torch_c.from_builtin_tensor"(%17768) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17770 = "util.global.load"() <{global = @"__auto.blk.14.attn_output.q_input:rscale"}> : () -> tensor<f32>
    %17771 = "torch_c.from_builtin_tensor"(%17770) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17772 = "util.global.load"() <{global = @"__auto.blk.14.attn_output.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
    %17773 = "torch_c.from_builtin_tensor"(%17772) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
    %17774 = "util.global.load"() <{global = @__auto.blk.14.ffn_norm.weight}> : () -> tensor<4096xbf16>
    %17775 = "torch_c.from_builtin_tensor"(%17774) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
    %17776 = "util.global.load"() <{global = @"__auto.blk.14.ffn_gate.q_input:rscale"}> : () -> tensor<f32>
    %17777 = "torch_c.from_builtin_tensor"(%17776) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17778 = "util.global.load"() <{global = @"__auto.blk.14.ffn_gate.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
    %17779 = "torch_c.from_builtin_tensor"(%17778) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
    %17780 = "util.global.load"() <{global = @"__auto.blk.14.ffn_up.q_input:rscale"}> : () -> tensor<f32>
    %17781 = "torch_c.from_builtin_tensor"(%17780) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17782 = "util.global.load"() <{global = @"__auto.blk.14.ffn_up.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
    %17783 = "torch_c.from_builtin_tensor"(%17782) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
    %17784 = "util.global.load"() <{global = @"__auto.blk.14.ffn_down.q_input:rscale"}> : () -> tensor<f32>
    %17785 = "torch_c.from_builtin_tensor"(%17784) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17786 = "util.global.load"() <{global = @"__auto.blk.14.ffn_down.weight:qs"}> : () -> tensor<4096x14336xf8E4M3FNUZ>
    %17787 = "torch_c.from_builtin_tensor"(%17786) : (tensor<4096x14336xf8E4M3FNUZ>) -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
    %17788 = "util.global.load"() <{global = @__auto.blk.15.attn_norm.weight}> : () -> tensor<4096xbf16>
    %17789 = "torch_c.from_builtin_tensor"(%17788) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
    %17790 = "util.global.load"() <{global = @"__auto.blk.15.attn_q.q_input:rscale"}> : () -> tensor<f32>
    %17791 = "torch_c.from_builtin_tensor"(%17790) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17792 = "util.global.load"() <{global = @"__auto.blk.15.attn_q.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
    %17793 = "torch_c.from_builtin_tensor"(%17792) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
    %17794 = "util.global.load"() <{global = @"__auto.blk.15.attn_q.q_output:rscale"}> : () -> tensor<f32>
    %17795 = "torch_c.from_builtin_tensor"(%17794) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17796 = "util.global.load"() <{global = @"__auto.blk.15.attn_k.q_input:rscale"}> : () -> tensor<f32>
    %17797 = "torch_c.from_builtin_tensor"(%17796) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17798 = "util.global.load"() <{global = @"__auto.blk.15.attn_k.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
    %17799 = "torch_c.from_builtin_tensor"(%17798) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
    %17800 = "util.global.load"() <{global = @"__auto.blk.15.attn_k.q_output:rscale"}> : () -> tensor<f32>
    %17801 = "torch_c.from_builtin_tensor"(%17800) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17802 = "util.global.load"() <{global = @"__auto.blk.15.attn_v.q_input:rscale"}> : () -> tensor<f32>
    %17803 = "torch_c.from_builtin_tensor"(%17802) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17804 = "util.global.load"() <{global = @"__auto.blk.15.attn_v.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
    %17805 = "torch_c.from_builtin_tensor"(%17804) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
    %17806 = "util.global.load"() <{global = @"__auto.blk.15.attn_v.q_output:rscale"}> : () -> tensor<f32>
    %17807 = "torch_c.from_builtin_tensor"(%17806) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17808 = "util.global.load"() <{global = @__auto.blk.15.attn_scale}> : () -> tensor<f32>
    %17809 = "torch_c.from_builtin_tensor"(%17808) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17810 = "util.global.load"() <{global = @"__auto.blk.15.attn_output.q_input:rscale"}> : () -> tensor<f32>
    %17811 = "torch_c.from_builtin_tensor"(%17810) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17812 = "util.global.load"() <{global = @"__auto.blk.15.attn_output.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
    %17813 = "torch_c.from_builtin_tensor"(%17812) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
    %17814 = "util.global.load"() <{global = @__auto.blk.15.ffn_norm.weight}> : () -> tensor<4096xbf16>
    %17815 = "torch_c.from_builtin_tensor"(%17814) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
    %17816 = "util.global.load"() <{global = @"__auto.blk.15.ffn_gate.q_input:rscale"}> : () -> tensor<f32>
    %17817 = "torch_c.from_builtin_tensor"(%17816) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17818 = "util.global.load"() <{global = @"__auto.blk.15.ffn_gate.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
    %17819 = "torch_c.from_builtin_tensor"(%17818) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
    %17820 = "util.global.load"() <{global = @"__auto.blk.15.ffn_up.q_input:rscale"}> : () -> tensor<f32>
    %17821 = "torch_c.from_builtin_tensor"(%17820) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17822 = "util.global.load"() <{global = @"__auto.blk.15.ffn_up.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
    %17823 = "torch_c.from_builtin_tensor"(%17822) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
    %17824 = "util.global.load"() <{global = @"__auto.blk.15.ffn_down.q_input:rscale"}> : () -> tensor<f32>
    %17825 = "torch_c.from_builtin_tensor"(%17824) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17826 = "util.global.load"() <{global = @"__auto.blk.15.ffn_down.weight:qs"}> : () -> tensor<4096x14336xf8E4M3FNUZ>
    %17827 = "torch_c.from_builtin_tensor"(%17826) : (tensor<4096x14336xf8E4M3FNUZ>) -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
    %17828 = "util.global.load"() <{global = @__auto.blk.16.attn_norm.weight}> : () -> tensor<4096xbf16>
    %17829 = "torch_c.from_builtin_tensor"(%17828) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
    %17830 = "util.global.load"() <{global = @"__auto.blk.16.attn_q.q_input:rscale"}> : () -> tensor<f32>
    %17831 = "torch_c.from_builtin_tensor"(%17830) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17832 = "util.global.load"() <{global = @"__auto.blk.16.attn_q.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
    %17833 = "torch_c.from_builtin_tensor"(%17832) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
    %17834 = "util.global.load"() <{global = @"__auto.blk.16.attn_q.q_output:rscale"}> : () -> tensor<f32>
    %17835 = "torch_c.from_builtin_tensor"(%17834) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17836 = "util.global.load"() <{global = @"__auto.blk.16.attn_k.q_input:rscale"}> : () -> tensor<f32>
    %17837 = "torch_c.from_builtin_tensor"(%17836) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17838 = "util.global.load"() <{global = @"__auto.blk.16.attn_k.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
    %17839 = "torch_c.from_builtin_tensor"(%17838) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
    %17840 = "util.global.load"() <{global = @"__auto.blk.16.attn_k.q_output:rscale"}> : () -> tensor<f32>
    %17841 = "torch_c.from_builtin_tensor"(%17840) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17842 = "util.global.load"() <{global = @"__auto.blk.16.attn_v.q_input:rscale"}> : () -> tensor<f32>
    %17843 = "torch_c.from_builtin_tensor"(%17842) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17844 = "util.global.load"() <{global = @"__auto.blk.16.attn_v.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
    %17845 = "torch_c.from_builtin_tensor"(%17844) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
    %17846 = "util.global.load"() <{global = @"__auto.blk.16.attn_v.q_output:rscale"}> : () -> tensor<f32>
    %17847 = "torch_c.from_builtin_tensor"(%17846) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17848 = "util.global.load"() <{global = @__auto.blk.16.attn_scale}> : () -> tensor<f32>
    %17849 = "torch_c.from_builtin_tensor"(%17848) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17850 = "util.global.load"() <{global = @"__auto.blk.16.attn_output.q_input:rscale"}> : () -> tensor<f32>
    %17851 = "torch_c.from_builtin_tensor"(%17850) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17852 = "util.global.load"() <{global = @"__auto.blk.16.attn_output.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
    %17853 = "torch_c.from_builtin_tensor"(%17852) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
    %17854 = "util.global.load"() <{global = @__auto.blk.16.ffn_norm.weight}> : () -> tensor<4096xbf16>
    %17855 = "torch_c.from_builtin_tensor"(%17854) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
    %17856 = "util.global.load"() <{global = @"__auto.blk.16.ffn_gate.q_input:rscale"}> : () -> tensor<f32>
    %17857 = "torch_c.from_builtin_tensor"(%17856) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17858 = "util.global.load"() <{global = @"__auto.blk.16.ffn_gate.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
    %17859 = "torch_c.from_builtin_tensor"(%17858) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
    %17860 = "util.global.load"() <{global = @"__auto.blk.16.ffn_up.q_input:rscale"}> : () -> tensor<f32>
    %17861 = "torch_c.from_builtin_tensor"(%17860) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17862 = "util.global.load"() <{global = @"__auto.blk.16.ffn_up.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
    %17863 = "torch_c.from_builtin_tensor"(%17862) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
    %17864 = "util.global.load"() <{global = @"__auto.blk.16.ffn_down.q_input:rscale"}> : () -> tensor<f32>
    %17865 = "torch_c.from_builtin_tensor"(%17864) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17866 = "util.global.load"() <{global = @"__auto.blk.16.ffn_down.weight:qs"}> : () -> tensor<4096x14336xf8E4M3FNUZ>
    %17867 = "torch_c.from_builtin_tensor"(%17866) : (tensor<4096x14336xf8E4M3FNUZ>) -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
    %17868 = "util.global.load"() <{global = @__auto.blk.17.attn_norm.weight}> : () -> tensor<4096xbf16>
    %17869 = "torch_c.from_builtin_tensor"(%17868) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
    %17870 = "util.global.load"() <{global = @"__auto.blk.17.attn_q.q_input:rscale"}> : () -> tensor<f32>
    %17871 = "torch_c.from_builtin_tensor"(%17870) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17872 = "util.global.load"() <{global = @"__auto.blk.17.attn_q.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
    %17873 = "torch_c.from_builtin_tensor"(%17872) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
    %17874 = "util.global.load"() <{global = @"__auto.blk.17.attn_q.q_output:rscale"}> : () -> tensor<f32>
    %17875 = "torch_c.from_builtin_tensor"(%17874) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17876 = "util.global.load"() <{global = @"__auto.blk.17.attn_k.q_input:rscale"}> : () -> tensor<f32>
    %17877 = "torch_c.from_builtin_tensor"(%17876) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17878 = "util.global.load"() <{global = @"__auto.blk.17.attn_k.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
    %17879 = "torch_c.from_builtin_tensor"(%17878) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
    %17880 = "util.global.load"() <{global = @"__auto.blk.17.attn_k.q_output:rscale"}> : () -> tensor<f32>
    %17881 = "torch_c.from_builtin_tensor"(%17880) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17882 = "util.global.load"() <{global = @"__auto.blk.17.attn_v.q_input:rscale"}> : () -> tensor<f32>
    %17883 = "torch_c.from_builtin_tensor"(%17882) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17884 = "util.global.load"() <{global = @"__auto.blk.17.attn_v.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
    %17885 = "torch_c.from_builtin_tensor"(%17884) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
    %17886 = "util.global.load"() <{global = @"__auto.blk.17.attn_v.q_output:rscale"}> : () -> tensor<f32>
    %17887 = "torch_c.from_builtin_tensor"(%17886) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17888 = "util.global.load"() <{global = @__auto.blk.17.attn_scale}> : () -> tensor<f32>
    %17889 = "torch_c.from_builtin_tensor"(%17888) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17890 = "util.global.load"() <{global = @"__auto.blk.17.attn_output.q_input:rscale"}> : () -> tensor<f32>
    %17891 = "torch_c.from_builtin_tensor"(%17890) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17892 = "util.global.load"() <{global = @"__auto.blk.17.attn_output.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
    %17893 = "torch_c.from_builtin_tensor"(%17892) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
    %17894 = "util.global.load"() <{global = @__auto.blk.17.ffn_norm.weight}> : () -> tensor<4096xbf16>
    %17895 = "torch_c.from_builtin_tensor"(%17894) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
    %17896 = "util.global.load"() <{global = @"__auto.blk.17.ffn_gate.q_input:rscale"}> : () -> tensor<f32>
    %17897 = "torch_c.from_builtin_tensor"(%17896) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17898 = "util.global.load"() <{global = @"__auto.blk.17.ffn_gate.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
    %17899 = "torch_c.from_builtin_tensor"(%17898) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
    %17900 = "util.global.load"() <{global = @"__auto.blk.17.ffn_up.q_input:rscale"}> : () -> tensor<f32>
    %17901 = "torch_c.from_builtin_tensor"(%17900) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17902 = "util.global.load"() <{global = @"__auto.blk.17.ffn_up.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
    %17903 = "torch_c.from_builtin_tensor"(%17902) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
    %17904 = "util.global.load"() <{global = @"__auto.blk.17.ffn_down.q_input:rscale"}> : () -> tensor<f32>
    %17905 = "torch_c.from_builtin_tensor"(%17904) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17906 = "util.global.load"() <{global = @"__auto.blk.17.ffn_down.weight:qs"}> : () -> tensor<4096x14336xf8E4M3FNUZ>
    %17907 = "torch_c.from_builtin_tensor"(%17906) : (tensor<4096x14336xf8E4M3FNUZ>) -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
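    // blk.18 parameter loads.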
    %17908 = "util.global.load"() <{global = @__auto.blk.18.attn_norm.weight}> : () -> tensor<4096xbf16>
    %17909 = "torch_c.from_builtin_tensor"(%17908) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
    %17910 = "util.global.load"() <{global = @"__auto.blk.18.attn_q.q_input:rscale"}> : () -> tensor<f32>
    %17911 = "torch_c.from_builtin_tensor"(%17910) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17912 = "util.global.load"() <{global = @"__auto.blk.18.attn_q.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
    %17913 = "torch_c.from_builtin_tensor"(%17912) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
    %17914 = "util.global.load"() <{global = @"__auto.blk.18.attn_q.q_output:rscale"}> : () -> tensor<f32>
    %17915 = "torch_c.from_builtin_tensor"(%17914) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17916 = "util.global.load"() <{global = @"__auto.blk.18.attn_k.q_input:rscale"}> : () -> tensor<f32>
    %17917 = "torch_c.from_builtin_tensor"(%17916) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17918 = "util.global.load"() <{global = @"__auto.blk.18.attn_k.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
    %17919 = "torch_c.from_builtin_tensor"(%17918) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
    %17920 = "util.global.load"() <{global = @"__auto.blk.18.attn_k.q_output:rscale"}> : () -> tensor<f32>
    %17921 = "torch_c.from_builtin_tensor"(%17920) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17922 = "util.global.load"() <{global = @"__auto.blk.18.attn_v.q_input:rscale"}> : () -> tensor<f32>
    %17923 = "torch_c.from_builtin_tensor"(%17922) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17924 = "util.global.load"() <{global = @"__auto.blk.18.attn_v.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
    %17925 = "torch_c.from_builtin_tensor"(%17924) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
    %17926 = "util.global.load"() <{global = @"__auto.blk.18.attn_v.q_output:rscale"}> : () -> tensor<f32>
    %17927 = "torch_c.from_builtin_tensor"(%17926) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17928 = "util.global.load"() <{global = @__auto.blk.18.attn_scale}> : () -> tensor<f32>
    %17929 = "torch_c.from_builtin_tensor"(%17928) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17930 = "util.global.load"() <{global = @"__auto.blk.18.attn_output.q_input:rscale"}> : () -> tensor<f32>
    %17931 = "torch_c.from_builtin_tensor"(%17930) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17932 = "util.global.load"() <{global = @"__auto.blk.18.attn_output.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
    %17933 = "torch_c.from_builtin_tensor"(%17932) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
    %17934 = "util.global.load"() <{global = @__auto.blk.18.ffn_norm.weight}> : () -> tensor<4096xbf16>
    %17935 = "torch_c.from_builtin_tensor"(%17934) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
    %17936 = "util.global.load"() <{global = @"__auto.blk.18.ffn_gate.q_input:rscale"}> : () -> tensor<f32>
    %17937 = "torch_c.from_builtin_tensor"(%17936) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17938 = "util.global.load"() <{global = @"__auto.blk.18.ffn_gate.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
    %17939 = "torch_c.from_builtin_tensor"(%17938) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
    %17940 = "util.global.load"() <{global = @"__auto.blk.18.ffn_up.q_input:rscale"}> : () -> tensor<f32>
    %17941 = "torch_c.from_builtin_tensor"(%17940) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17942 = "util.global.load"() <{global = @"__auto.blk.18.ffn_up.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
    %17943 = "torch_c.from_builtin_tensor"(%17942) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
    %17944 = "util.global.load"() <{global = @"__auto.blk.18.ffn_down.q_input:rscale"}> : () -> tensor<f32>
    %17945 = "torch_c.from_builtin_tensor"(%17944) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17946 = "util.global.load"() <{global = @"__auto.blk.18.ffn_down.weight:qs"}> : () -> tensor<4096x14336xf8E4M3FNUZ>
    %17947 = "torch_c.from_builtin_tensor"(%17946) : (tensor<4096x14336xf8E4M3FNUZ>) -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
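    // blk.19 parameter loads.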
    %17948 = "util.global.load"() <{global = @__auto.blk.19.attn_norm.weight}> : () -> tensor<4096xbf16>
    %17949 = "torch_c.from_builtin_tensor"(%17948) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
    %17950 = "util.global.load"() <{global = @"__auto.blk.19.attn_q.q_input:rscale"}> : () -> tensor<f32>
    %17951 = "torch_c.from_builtin_tensor"(%17950) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17952 = "util.global.load"() <{global = @"__auto.blk.19.attn_q.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
    %17953 = "torch_c.from_builtin_tensor"(%17952) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
    %17954 = "util.global.load"() <{global = @"__auto.blk.19.attn_q.q_output:rscale"}> : () -> tensor<f32>
    %17955 = "torch_c.from_builtin_tensor"(%17954) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17956 = "util.global.load"() <{global = @"__auto.blk.19.attn_k.q_input:rscale"}> : () -> tensor<f32>
    %17957 = "torch_c.from_builtin_tensor"(%17956) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17958 = "util.global.load"() <{global = @"__auto.blk.19.attn_k.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
    %17959 = "torch_c.from_builtin_tensor"(%17958) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
    %17960 = "util.global.load"() <{global = @"__auto.blk.19.attn_k.q_output:rscale"}> : () -> tensor<f32>
    %17961 = "torch_c.from_builtin_tensor"(%17960) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17962 = "util.global.load"() <{global = @"__auto.blk.19.attn_v.q_input:rscale"}> : () -> tensor<f32>
    %17963 = "torch_c.from_builtin_tensor"(%17962) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17964 = "util.global.load"() <{global = @"__auto.blk.19.attn_v.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
    %17965 = "torch_c.from_builtin_tensor"(%17964) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
    %17966 = "util.global.load"() <{global = @"__auto.blk.19.attn_v.q_output:rscale"}> : () -> tensor<f32>
    %17967 = "torch_c.from_builtin_tensor"(%17966) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17968 = "util.global.load"() <{global = @__auto.blk.19.attn_scale}> : () -> tensor<f32>
    %17969 = "torch_c.from_builtin_tensor"(%17968) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17970 = "util.global.load"() <{global = @"__auto.blk.19.attn_output.q_input:rscale"}> : () -> tensor<f32>
    %17971 = "torch_c.from_builtin_tensor"(%17970) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17972 = "util.global.load"() <{global = @"__auto.blk.19.attn_output.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
    %17973 = "torch_c.from_builtin_tensor"(%17972) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
    %17974 = "util.global.load"() <{global = @__auto.blk.19.ffn_norm.weight}> : () -> tensor<4096xbf16>
    %17975 = "torch_c.from_builtin_tensor"(%17974) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
    %17976 = "util.global.load"() <{global = @"__auto.blk.19.ffn_gate.q_input:rscale"}> : () -> tensor<f32>
    %17977 = "torch_c.from_builtin_tensor"(%17976) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17978 = "util.global.load"() <{global = @"__auto.blk.19.ffn_gate.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
    %17979 = "torch_c.from_builtin_tensor"(%17978) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
    %17980 = "util.global.load"() <{global = @"__auto.blk.19.ffn_up.q_input:rscale"}> : () -> tensor<f32>
    %17981 = "torch_c.from_builtin_tensor"(%17980) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17982 = "util.global.load"() <{global = @"__auto.blk.19.ffn_up.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
    %17983 = "torch_c.from_builtin_tensor"(%17982) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
    %17984 = "util.global.load"() <{global = @"__auto.blk.19.ffn_down.q_input:rscale"}> : () -> tensor<f32>
    %17985 = "torch_c.from_builtin_tensor"(%17984) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17986 = "util.global.load"() <{global = @"__auto.blk.19.ffn_down.weight:qs"}> : () -> tensor<4096x14336xf8E4M3FNUZ>
    %17987 = "torch_c.from_builtin_tensor"(%17986) : (tensor<4096x14336xf8E4M3FNUZ>) -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
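    // blk.20 parameter loads.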
    %17988 = "util.global.load"() <{global = @__auto.blk.20.attn_norm.weight}> : () -> tensor<4096xbf16>
    %17989 = "torch_c.from_builtin_tensor"(%17988) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
    %17990 = "util.global.load"() <{global = @"__auto.blk.20.attn_q.q_input:rscale"}> : () -> tensor<f32>
    %17991 = "torch_c.from_builtin_tensor"(%17990) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17992 = "util.global.load"() <{global = @"__auto.blk.20.attn_q.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
    %17993 = "torch_c.from_builtin_tensor"(%17992) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
    %17994 = "util.global.load"() <{global = @"__auto.blk.20.attn_q.q_output:rscale"}> : () -> tensor<f32>
    %17995 = "torch_c.from_builtin_tensor"(%17994) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17996 = "util.global.load"() <{global = @"__auto.blk.20.attn_k.q_input:rscale"}> : () -> tensor<f32>
    %17997 = "torch_c.from_builtin_tensor"(%17996) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %17998 = "util.global.load"() <{global = @"__auto.blk.20.attn_k.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
    %17999 = "torch_c.from_builtin_tensor"(%17998) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
    %18000 = "util.global.load"() <{global = @"__auto.blk.20.attn_k.q_output:rscale"}> : () -> tensor<f32>
    %18001 = "torch_c.from_builtin_tensor"(%18000) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %18002 = "util.global.load"() <{global = @"__auto.blk.20.attn_v.q_input:rscale"}> : () -> tensor<f32>
    %18003 = "torch_c.from_builtin_tensor"(%18002) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %18004 = "util.global.load"() <{global = @"__auto.blk.20.attn_v.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
    %18005 = "torch_c.from_builtin_tensor"(%18004) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
    %18006 = "util.global.load"() <{global = @"__auto.blk.20.attn_v.q_output:rscale"}> : () -> tensor<f32>
    %18007 = "torch_c.from_builtin_tensor"(%18006) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %18008 = "util.global.load"() <{global = @__auto.blk.20.attn_scale}> : () -> tensor<f32>
    %18009 = "torch_c.from_builtin_tensor"(%18008) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %18010 = "util.global.load"() <{global = @"__auto.blk.20.attn_output.q_input:rscale"}> : () -> tensor<f32>
    %18011 = "torch_c.from_builtin_tensor"(%18010) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %18012 = "util.global.load"() <{global = @"__auto.blk.20.attn_output.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
    %18013 = "torch_c.from_builtin_tensor"(%18012) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
    %18014 = "util.global.load"() <{global = @__auto.blk.20.ffn_norm.weight}> : () -> tensor<4096xbf16>
    %18015 = "torch_c.from_builtin_tensor"(%18014) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
    %18016 = "util.global.load"() <{global = @"__auto.blk.20.ffn_gate.q_input:rscale"}> : () -> tensor<f32>
    %18017 = "torch_c.from_builtin_tensor"(%18016) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %18018 = "util.global.load"() <{global = @"__auto.blk.20.ffn_gate.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
    %18019 = "torch_c.from_builtin_tensor"(%18018) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
    %18020 = "util.global.load"() <{global = @"__auto.blk.20.ffn_up.q_input:rscale"}> : () -> tensor<f32>
    %18021 = "torch_c.from_builtin_tensor"(%18020) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %18022 = "util.global.load"() <{global = @"__auto.blk.20.ffn_up.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
    %18023 = "torch_c.from_builtin_tensor"(%18022) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
    %18024 = "util.global.load"() <{global = @"__auto.blk.20.ffn_down.q_input:rscale"}> : () -> tensor<f32>
    %18025 = "torch_c.from_builtin_tensor"(%18024) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %18026 = "util.global.load"() <{global = @"__auto.blk.20.ffn_down.weight:qs"}> : () -> tensor<4096x14336xf8E4M3FNUZ>
    %18027 = "torch_c.from_builtin_tensor"(%18026) : (tensor<4096x14336xf8E4M3FNUZ>) -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
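    // blk.21 parameter loads.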
    %18028 = "util.global.load"() <{global = @__auto.blk.21.attn_norm.weight}> : () -> tensor<4096xbf16>
    %18029 = "torch_c.from_builtin_tensor"(%18028) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
    %18030 = "util.global.load"() <{global = @"__auto.blk.21.attn_q.q_input:rscale"}> : () -> tensor<f32>
    %18031 = "torch_c.from_builtin_tensor"(%18030) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %18032 = "util.global.load"() <{global = @"__auto.blk.21.attn_q.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
    %18033 = "torch_c.from_builtin_tensor"(%18032) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
    %18034 = "util.global.load"() <{global = @"__auto.blk.21.attn_q.q_output:rscale"}> : () -> tensor<f32>
    %18035 = "torch_c.from_builtin_tensor"(%18034) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %18036 = "util.global.load"() <{global = @"__auto.blk.21.attn_k.q_input:rscale"}> : () -> tensor<f32>
    %18037 = "torch_c.from_builtin_tensor"(%18036) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %18038 = "util.global.load"() <{global = @"__auto.blk.21.attn_k.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
    %18039 = "torch_c.from_builtin_tensor"(%18038) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
    %18040 = "util.global.load"() <{global = @"__auto.blk.21.attn_k.q_output:rscale"}> : () -> tensor<f32>
    %18041 = "torch_c.from_builtin_tensor"(%18040) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %18042 = "util.global.load"() <{global = @"__auto.blk.21.attn_v.q_input:rscale"}> : () -> tensor<f32>
    %18043 = "torch_c.from_builtin_tensor"(%18042) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %18044 = "util.global.load"() <{global = @"__auto.blk.21.attn_v.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
    %18045 = "torch_c.from_builtin_tensor"(%18044) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
    %18046 = "util.global.load"() <{global = @"__auto.blk.21.attn_v.q_output:rscale"}> : () -> tensor<f32>
    %18047 = "torch_c.from_builtin_tensor"(%18046) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %18048 = "util.global.load"() <{global = @__auto.blk.21.attn_scale}> : () -> tensor<f32>
    %18049 = "torch_c.from_builtin_tensor"(%18048) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %18050 = "util.global.load"() <{global = @"__auto.blk.21.attn_output.q_input:rscale"}> : () -> tensor<f32>
    %18051 = "torch_c.from_builtin_tensor"(%18050) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %18052 = "util.global.load"() <{global = @"__auto.blk.21.attn_output.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
    %18053 = "torch_c.from_builtin_tensor"(%18052) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
    %18054 = "util.global.load"() <{global = @__auto.blk.21.ffn_norm.weight}> : () -> tensor<4096xbf16>
    %18055 = "torch_c.from_builtin_tensor"(%18054) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
    %18056 = "util.global.load"() <{global = @"__auto.blk.21.ffn_gate.q_input:rscale"}> : () -> tensor<f32>
    %18057 = "torch_c.from_builtin_tensor"(%18056) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %18058 = "util.global.load"() <{global = @"__auto.blk.21.ffn_gate.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
    %18059 = "torch_c.from_builtin_tensor"(%18058) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
    %18060 = "util.global.load"() <{global = @"__auto.blk.21.ffn_up.q_input:rscale"}> : () -> tensor<f32>
    %18061 = "torch_c.from_builtin_tensor"(%18060) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %18062 = "util.global.load"() <{global = @"__auto.blk.21.ffn_up.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
    %18063 = "torch_c.from_builtin_tensor"(%18062) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
    %18064 = "util.global.load"() <{global = @"__auto.blk.21.ffn_down.q_input:rscale"}> : () -> tensor<f32>
    %18065 = "torch_c.from_builtin_tensor"(%18064) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %18066 = "util.global.load"() <{global = @"__auto.blk.21.ffn_down.weight:qs"}> : () -> tensor<4096x14336xf8E4M3FNUZ>
    %18067 = "torch_c.from_builtin_tensor"(%18066) : (tensor<4096x14336xf8E4M3FNUZ>) -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
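    // blk.22 parameter loads.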
    %18068 = "util.global.load"() <{global = @__auto.blk.22.attn_norm.weight}> : () -> tensor<4096xbf16>
    %18069 = "torch_c.from_builtin_tensor"(%18068) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
    %18070 = "util.global.load"() <{global = @"__auto.blk.22.attn_q.q_input:rscale"}> : () -> tensor<f32>
    %18071 = "torch_c.from_builtin_tensor"(%18070) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %18072 = "util.global.load"() <{global = @"__auto.blk.22.attn_q.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
    %18073 = "torch_c.from_builtin_tensor"(%18072) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
    %18074 = "util.global.load"() <{global = @"__auto.blk.22.attn_q.q_output:rscale"}> : () -> tensor<f32>
    %18075 = "torch_c.from_builtin_tensor"(%18074) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %18076 = "util.global.load"() <{global = @"__auto.blk.22.attn_k.q_input:rscale"}> : () -> tensor<f32>
    %18077 = "torch_c.from_builtin_tensor"(%18076) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %18078 = "util.global.load"() <{global = @"__auto.blk.22.attn_k.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
    %18079 = "torch_c.from_builtin_tensor"(%18078) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
    %18080 = "util.global.load"() <{global = @"__auto.blk.22.attn_k.q_output:rscale"}> : () -> tensor<f32>
    %18081 = "torch_c.from_builtin_tensor"(%18080) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %18082 = "util.global.load"() <{global = @"__auto.blk.22.attn_v.q_input:rscale"}> : () -> tensor<f32>
    %18083 = "torch_c.from_builtin_tensor"(%18082) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %18084 = "util.global.load"() <{global = @"__auto.blk.22.attn_v.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
    %18085 = "torch_c.from_builtin_tensor"(%18084) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
    %18086 = "util.global.load"() <{global = @"__auto.blk.22.attn_v.q_output:rscale"}> : () -> tensor<f32>
    %18087 = "torch_c.from_builtin_tensor"(%18086) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %18088 = "util.global.load"() <{global = @__auto.blk.22.attn_scale}> : () -> tensor<f32>
    %18089 = "torch_c.from_builtin_tensor"(%18088) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %18090 = "util.global.load"() <{global = @"__auto.blk.22.attn_output.q_input:rscale"}> : () -> tensor<f32>
    %18091 = "torch_c.from_builtin_tensor"(%18090) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %18092 = "util.global.load"() <{global = @"__auto.blk.22.attn_output.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
    %18093 = "torch_c.from_builtin_tensor"(%18092) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
    %18094 = "util.global.load"() <{global = @__auto.blk.22.ffn_norm.weight}> : () -> tensor<4096xbf16>
    %18095 = "torch_c.from_builtin_tensor"(%18094) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
    %18096 = "util.global.load"() <{global = @"__auto.blk.22.ffn_gate.q_input:rscale"}> : () -> tensor<f32>
    %18097 = "torch_c.from_builtin_tensor"(%18096) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %18098 = "util.global.load"() <{global = @"__auto.blk.22.ffn_gate.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
    %18099 = "torch_c.from_builtin_tensor"(%18098) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
    %18100 = "util.global.load"() <{global = @"__auto.blk.22.ffn_up.q_input:rscale"}> : () -> tensor<f32>
    %18101 = "torch_c.from_builtin_tensor"(%18100) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %18102 = "util.global.load"() <{global = @"__auto.blk.22.ffn_up.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
    %18103 = "torch_c.from_builtin_tensor"(%18102) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
    %18104 = "util.global.load"() <{global = @"__auto.blk.22.ffn_down.q_input:rscale"}> : () -> tensor<f32>
    %18105 = "torch_c.from_builtin_tensor"(%18104) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %18106 = "util.global.load"() <{global = @"__auto.blk.22.ffn_down.weight:qs"}> : () -> tensor<4096x14336xf8E4M3FNUZ>
    %18107 = "torch_c.from_builtin_tensor"(%18106) : (tensor<4096x14336xf8E4M3FNUZ>) -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
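    // blk.23 parameter loads.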
    %18108 = "util.global.load"() <{global = @__auto.blk.23.attn_norm.weight}> : () -> tensor<4096xbf16>
    %18109 = "torch_c.from_builtin_tensor"(%18108) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
    %18110 = "util.global.load"() <{global = @"__auto.blk.23.attn_q.q_input:rscale"}> : () -> tensor<f32>
    %18111 = "torch_c.from_builtin_tensor"(%18110) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %18112 = "util.global.load"() <{global = @"__auto.blk.23.attn_q.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
    %18113 = "torch_c.from_builtin_tensor"(%18112) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
    %18114 = "util.global.load"() <{global = @"__auto.blk.23.attn_q.q_output:rscale"}> : () -> tensor<f32>
    %18115 = "torch_c.from_builtin_tensor"(%18114) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %18116 = "util.global.load"() <{global = @"__auto.blk.23.attn_k.q_input:rscale"}> : () -> tensor<f32>
    %18117 = "torch_c.from_builtin_tensor"(%18116) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %18118 = "util.global.load"() <{global = @"__auto.blk.23.attn_k.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
    %18119 = "torch_c.from_builtin_tensor"(%18118) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
    %18120 = "util.global.load"() <{global = @"__auto.blk.23.attn_k.q_output:rscale"}> : () -> tensor<f32>
    %18121 = "torch_c.from_builtin_tensor"(%18120) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %18122 = "util.global.load"() <{global = @"__auto.blk.23.attn_v.q_input:rscale"}> : () -> tensor<f32>
    %18123 = "torch_c.from_builtin_tensor"(%18122) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %18124 = "util.global.load"() <{global = @"__auto.blk.23.attn_v.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
    %18125 = "torch_c.from_builtin_tensor"(%18124) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
    %18126 = "util.global.load"() <{global = @"__auto.blk.23.attn_v.q_output:rscale"}> : () -> tensor<f32>
    %18127 = "torch_c.from_builtin_tensor"(%18126) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %18128 = "util.global.load"() <{global = @__auto.blk.23.attn_scale}> : () -> tensor<f32>
    %18129 = "torch_c.from_builtin_tensor"(%18128) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %18130 = "util.global.load"() <{global = @"__auto.blk.23.attn_output.q_input:rscale"}> : () -> tensor<f32>
    %18131 = "torch_c.from_builtin_tensor"(%18130) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %18132 = "util.global.load"() <{global = @"__auto.blk.23.attn_output.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
    %18133 = "torch_c.from_builtin_tensor"(%18132) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
    %18134 = "util.global.load"() <{global = @__auto.blk.23.ffn_norm.weight}> : () -> tensor<4096xbf16>
    %18135 = "torch_c.from_builtin_tensor"(%18134) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
    %18136 = "util.global.load"() <{global = @"__auto.blk.23.ffn_gate.q_input:rscale"}> : () -> tensor<f32>
    %18137 = "torch_c.from_builtin_tensor"(%18136) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %18138 = "util.global.load"() <{global = @"__auto.blk.23.ffn_gate.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
    %18139 = "torch_c.from_builtin_tensor"(%18138) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
    %18140 = "util.global.load"() <{global = @"__auto.blk.23.ffn_up.q_input:rscale"}> : () -> tensor<f32>
    %18141 = "torch_c.from_builtin_tensor"(%18140) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %18142 = "util.global.load"() <{global = @"__auto.blk.23.ffn_up.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
    %18143 = "torch_c.from_builtin_tensor"(%18142) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
    %18144 = "util.global.load"() <{global = @"__auto.blk.23.ffn_down.q_input:rscale"}> : () -> tensor<f32>
    %18145 = "torch_c.from_builtin_tensor"(%18144) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %18146 = "util.global.load"() <{global = @"__auto.blk.23.ffn_down.weight:qs"}> : () -> tensor<4096x14336xf8E4M3FNUZ>
    %18147 = "torch_c.from_builtin_tensor"(%18146) : (tensor<4096x14336xf8E4M3FNUZ>) -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
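    // blk.24 parameter loads.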
    %18148 = "util.global.load"() <{global = @__auto.blk.24.attn_norm.weight}> : () -> tensor<4096xbf16>
    %18149 = "torch_c.from_builtin_tensor"(%18148) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
    %18150 = "util.global.load"() <{global = @"__auto.blk.24.attn_q.q_input:rscale"}> : () -> tensor<f32>
    %18151 = "torch_c.from_builtin_tensor"(%18150) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %18152 = "util.global.load"() <{global = @"__auto.blk.24.attn_q.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
    %18153 = "torch_c.from_builtin_tensor"(%18152) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
    %18154 = "util.global.load"() <{global = @"__auto.blk.24.attn_q.q_output:rscale"}> : () -> tensor<f32>
    %18155 = "torch_c.from_builtin_tensor"(%18154) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %18156 = "util.global.load"() <{global = @"__auto.blk.24.attn_k.q_input:rscale"}> : () -> tensor<f32>
    %18157 = "torch_c.from_builtin_tensor"(%18156) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %18158 = "util.global.load"() <{global = @"__auto.blk.24.attn_k.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
    %18159 = "torch_c.from_builtin_tensor"(%18158) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
    %18160 = "util.global.load"() <{global = @"__auto.blk.24.attn_k.q_output:rscale"}> : () -> tensor<f32>
    %18161 = "torch_c.from_builtin_tensor"(%18160) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %18162 = "util.global.load"() <{global = @"__auto.blk.24.attn_v.q_input:rscale"}> : () -> tensor<f32>
    %18163 = "torch_c.from_builtin_tensor"(%18162) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %18164 = "util.global.load"() <{global = @"__auto.blk.24.attn_v.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
    %18165 = "torch_c.from_builtin_tensor"(%18164) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
    %18166 = "util.global.load"() <{global = @"__auto.blk.24.attn_v.q_output:rscale"}> : () -> tensor<f32>
    %18167 = "torch_c.from_builtin_tensor"(%18166) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %18168 = "util.global.load"() <{global = @__auto.blk.24.attn_scale}> : () -> tensor<f32>
    %18169 = "torch_c.from_builtin_tensor"(%18168) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %18170 = "util.global.load"() <{global = @"__auto.blk.24.attn_output.q_input:rscale"}> : () -> tensor<f32>
    %18171 = "torch_c.from_builtin_tensor"(%18170) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %18172 = "util.global.load"() <{global = @"__auto.blk.24.attn_output.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
    %18173 = "torch_c.from_builtin_tensor"(%18172) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
    %18174 = "util.global.load"() <{global = @__auto.blk.24.ffn_norm.weight}> : () -> tensor<4096xbf16>
    %18175 = "torch_c.from_builtin_tensor"(%18174) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
    %18176 = "util.global.load"() <{global = @"__auto.blk.24.ffn_gate.q_input:rscale"}> : () -> tensor<f32>
    %18177 = "torch_c.from_builtin_tensor"(%18176) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %18178 = "util.global.load"() <{global = @"__auto.blk.24.ffn_gate.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
    %18179 = "torch_c.from_builtin_tensor"(%18178) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
    %18180 = "util.global.load"() <{global = @"__auto.blk.24.ffn_up.q_input:rscale"}> : () -> tensor<f32>
    %18181 = "torch_c.from_builtin_tensor"(%18180) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %18182 = "util.global.load"() <{global = @"__auto.blk.24.ffn_up.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
    %18183 = "torch_c.from_builtin_tensor"(%18182) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
    %18184 = "util.global.load"() <{global = @"__auto.blk.24.ffn_down.q_input:rscale"}> : () -> tensor<f32>
    %18185 = "torch_c.from_builtin_tensor"(%18184) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %18186 = "util.global.load"() <{global = @"__auto.blk.24.ffn_down.weight:qs"}> : () -> tensor<4096x14336xf8E4M3FNUZ>
    %18187 = "torch_c.from_builtin_tensor"(%18186) : (tensor<4096x14336xf8E4M3FNUZ>) -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
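    // blk.25 parameter loads.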
    %18188 = "util.global.load"() <{global = @__auto.blk.25.attn_norm.weight}> : () -> tensor<4096xbf16>
    %18189 = "torch_c.from_builtin_tensor"(%18188) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
    %18190 = "util.global.load"() <{global = @"__auto.blk.25.attn_q.q_input:rscale"}> : () -> tensor<f32>
    %18191 = "torch_c.from_builtin_tensor"(%18190) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %18192 = "util.global.load"() <{global = @"__auto.blk.25.attn_q.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
    %18193 = "torch_c.from_builtin_tensor"(%18192) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
    %18194 = "util.global.load"() <{global = @"__auto.blk.25.attn_q.q_output:rscale"}> : () -> tensor<f32>
    %18195 = "torch_c.from_builtin_tensor"(%18194) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %18196 = "util.global.load"() <{global = @"__auto.blk.25.attn_k.q_input:rscale"}> : () -> tensor<f32>
    %18197 = "torch_c.from_builtin_tensor"(%18196) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %18198 = "util.global.load"() <{global = @"__auto.blk.25.attn_k.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
    %18199 = "torch_c.from_builtin_tensor"(%18198) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
    %18200 = "util.global.load"() <{global = @"__auto.blk.25.attn_k.q_output:rscale"}> : () -> tensor<f32>
    %18201 = "torch_c.from_builtin_tensor"(%18200) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %18202 = "util.global.load"() <{global = @"__auto.blk.25.attn_v.q_input:rscale"}> : () -> tensor<f32>
    %18203 = "torch_c.from_builtin_tensor"(%18202) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %18204 = "util.global.load"() <{global = @"__auto.blk.25.attn_v.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
    %18205 = "torch_c.from_builtin_tensor"(%18204) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
    %18206 = "util.global.load"() <{global = @"__auto.blk.25.attn_v.q_output:rscale"}> : () -> tensor<f32>
    %18207 = "torch_c.from_builtin_tensor"(%18206) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %18208 = "util.global.load"() <{global = @__auto.blk.25.attn_scale}> : () -> tensor<f32>
    %18209 = "torch_c.from_builtin_tensor"(%18208) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %18210 = "util.global.load"() <{global = @"__auto.blk.25.attn_output.q_input:rscale"}> : () -> tensor<f32>
    %18211 = "torch_c.from_builtin_tensor"(%18210) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %18212 = "util.global.load"() <{global = @"__auto.blk.25.attn_output.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
    %18213 = "torch_c.from_builtin_tensor"(%18212) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
    %18214 = "util.global.load"() <{global = @__auto.blk.25.ffn_norm.weight}> : () -> tensor<4096xbf16>
    %18215 = "torch_c.from_builtin_tensor"(%18214) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
    %18216 = "util.global.load"() <{global = @"__auto.blk.25.ffn_gate.q_input:rscale"}> : () -> tensor<f32>
    %18217 = "torch_c.from_builtin_tensor"(%18216) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %18218 = "util.global.load"() <{global = @"__auto.blk.25.ffn_gate.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
    %18219 = "torch_c.from_builtin_tensor"(%18218) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
    %18220 = "util.global.load"() <{global = @"__auto.blk.25.ffn_up.q_input:rscale"}> : () -> tensor<f32>
    %18221 = "torch_c.from_builtin_tensor"(%18220) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %18222 = "util.global.load"() <{global = @"__auto.blk.25.ffn_up.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
    %18223 = "torch_c.from_builtin_tensor"(%18222) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
    %18224 = "util.global.load"() <{global = @"__auto.blk.25.ffn_down.q_input:rscale"}> : () -> tensor<f32>
    %18225 = "torch_c.from_builtin_tensor"(%18224) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %18226 = "util.global.load"() <{global = @"__auto.blk.25.ffn_down.weight:qs"}> : () -> tensor<4096x14336xf8E4M3FNUZ>
    %18227 = "torch_c.from_builtin_tensor"(%18226) : (tensor<4096x14336xf8E4M3FNUZ>) -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
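    // blk.26 parameter loads.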
    %18228 = "util.global.load"() <{global = @__auto.blk.26.attn_norm.weight}> : () -> tensor<4096xbf16>
    %18229 = "torch_c.from_builtin_tensor"(%18228) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
    %18230 = "util.global.load"() <{global = @"__auto.blk.26.attn_q.q_input:rscale"}> : () -> tensor<f32>
    %18231 = "torch_c.from_builtin_tensor"(%18230) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %18232 = "util.global.load"() <{global = @"__auto.blk.26.attn_q.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
    %18233 = "torch_c.from_builtin_tensor"(%18232) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
    %18234 = "util.global.load"() <{global = @"__auto.blk.26.attn_q.q_output:rscale"}> : () -> tensor<f32>
    %18235 = "torch_c.from_builtin_tensor"(%18234) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %18236 = "util.global.load"() <{global = @"__auto.blk.26.attn_k.q_input:rscale"}> : () -> tensor<f32>
    %18237 = "torch_c.from_builtin_tensor"(%18236) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %18238 = "util.global.load"() <{global = @"__auto.blk.26.attn_k.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
    %18239 = "torch_c.from_builtin_tensor"(%18238) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
    %18240 = "util.global.load"() <{global = @"__auto.blk.26.attn_k.q_output:rscale"}> : () -> tensor<f32>
    %18241 = "torch_c.from_builtin_tensor"(%18240) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %18242 = "util.global.load"() <{global = @"__auto.blk.26.attn_v.q_input:rscale"}> : () -> tensor<f32>
    %18243 = "torch_c.from_builtin_tensor"(%18242) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %18244 = "util.global.load"() <{global = @"__auto.blk.26.attn_v.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
    %18245 = "torch_c.from_builtin_tensor"(%18244) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
    %18246 = "util.global.load"() <{global = @"__auto.blk.26.attn_v.q_output:rscale"}> : () -> tensor<f32>
    %18247 = "torch_c.from_builtin_tensor"(%18246) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %18248 = "util.global.load"() <{global = @__auto.blk.26.attn_scale}> : () -> tensor<f32>
    %18249 = "torch_c.from_builtin_tensor"(%18248) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %18250 = "util.global.load"() <{global = @"__auto.blk.26.attn_output.q_input:rscale"}> : () -> tensor<f32>
    %18251 = "torch_c.from_builtin_tensor"(%18250) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %18252 = "util.global.load"() <{global = @"__auto.blk.26.attn_output.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
    %18253 = "torch_c.from_builtin_tensor"(%18252) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
    %18254 = "util.global.load"() <{global = @__auto.blk.26.ffn_norm.weight}> : () -> tensor<4096xbf16>
    %18255 = "torch_c.from_builtin_tensor"(%18254) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
    %18256 = "util.global.load"() <{global = @"__auto.blk.26.ffn_gate.q_input:rscale"}> : () -> tensor<f32>
    %18257 = "torch_c.from_builtin_tensor"(%18256) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %18258 = "util.global.load"() <{global = @"__auto.blk.26.ffn_gate.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
    %18259 = "torch_c.from_builtin_tensor"(%18258) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
    %18260 = "util.global.load"() <{global = @"__auto.blk.26.ffn_up.q_input:rscale"}> : () -> tensor<f32>
    %18261 = "torch_c.from_builtin_tensor"(%18260) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %18262 = "util.global.load"() <{global = @"__auto.blk.26.ffn_up.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
    %18263 = "torch_c.from_builtin_tensor"(%18262) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
    %18264 = "util.global.load"() <{global = @"__auto.blk.26.ffn_down.q_input:rscale"}> : () -> tensor<f32>
    %18265 = "torch_c.from_builtin_tensor"(%18264) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %18266 = "util.global.load"() <{global = @"__auto.blk.26.ffn_down.weight:qs"}> : () -> tensor<4096x14336xf8E4M3FNUZ>
    %18267 = "torch_c.from_builtin_tensor"(%18266) : (tensor<4096x14336xf8E4M3FNUZ>) -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
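    // blk.27 parameter loads.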
    %18268 = "util.global.load"() <{global = @__auto.blk.27.attn_norm.weight}> : () -> tensor<4096xbf16>
    %18269 = "torch_c.from_builtin_tensor"(%18268) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
    %18270 = "util.global.load"() <{global = @"__auto.blk.27.attn_q.q_input:rscale"}> : () -> tensor<f32>
    %18271 = "torch_c.from_builtin_tensor"(%18270) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %18272 = "util.global.load"() <{global = @"__auto.blk.27.attn_q.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
    %18273 = "torch_c.from_builtin_tensor"(%18272) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
    %18274 = "util.global.load"() <{global = @"__auto.blk.27.attn_q.q_output:rscale"}> : () -> tensor<f32>
    %18275 = "torch_c.from_builtin_tensor"(%18274) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %18276 = "util.global.load"() <{global = @"__auto.blk.27.attn_k.q_input:rscale"}> : () -> tensor<f32>
    %18277 = "torch_c.from_builtin_tensor"(%18276) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %18278 = "util.global.load"() <{global = @"__auto.blk.27.attn_k.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
    %18279 = "torch_c.from_builtin_tensor"(%18278) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
    %18280 = "util.global.load"() <{global = @"__auto.blk.27.attn_k.q_output:rscale"}> : () -> tensor<f32>
    %18281 = "torch_c.from_builtin_tensor"(%18280) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %18282 = "util.global.load"() <{global = @"__auto.blk.27.attn_v.q_input:rscale"}> : () -> tensor<f32>
    %18283 = "torch_c.from_builtin_tensor"(%18282) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %18284 = "util.global.load"() <{global = @"__auto.blk.27.attn_v.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
    %18285 = "torch_c.from_builtin_tensor"(%18284) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
    %18286 = "util.global.load"() <{global = @"__auto.blk.27.attn_v.q_output:rscale"}> : () -> tensor<f32>
    %18287 = "torch_c.from_builtin_tensor"(%18286) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %18288 = "util.global.load"() <{global = @__auto.blk.27.attn_scale}> : () -> tensor<f32>
    %18289 = "torch_c.from_builtin_tensor"(%18288) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %18290 = "util.global.load"() <{global = @"__auto.blk.27.attn_output.q_input:rscale"}> : () -> tensor<f32>
    %18291 = "torch_c.from_builtin_tensor"(%18290) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %18292 = "util.global.load"() <{global = @"__auto.blk.27.attn_output.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
    %18293 = "torch_c.from_builtin_tensor"(%18292) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
    %18294 = "util.global.load"() <{global = @__auto.blk.27.ffn_norm.weight}> : () -> tensor<4096xbf16>
    %18295 = "torch_c.from_builtin_tensor"(%18294) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
    %18296 = "util.global.load"() <{global = @"__auto.blk.27.ffn_gate.q_input:rscale"}> : () -> tensor<f32>
    %18297 = "torch_c.from_builtin_tensor"(%18296) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %18298 = "util.global.load"() <{global = @"__auto.blk.27.ffn_gate.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
    %18299 = "torch_c.from_builtin_tensor"(%18298) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
    %18300 = "util.global.load"() <{global = @"__auto.blk.27.ffn_up.q_input:rscale"}> : () -> tensor<f32>
    %18301 = "torch_c.from_builtin_tensor"(%18300) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %18302 = "util.global.load"() <{global = @"__auto.blk.27.ffn_up.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
    %18303 = "torch_c.from_builtin_tensor"(%18302) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
    %18304 = "util.global.load"() <{global = @"__auto.blk.27.ffn_down.q_input:rscale"}> : () -> tensor<f32>
    %18305 = "torch_c.from_builtin_tensor"(%18304) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %18306 = "util.global.load"() <{global = @"__auto.blk.27.ffn_down.weight:qs"}> : () -> tensor<4096x14336xf8E4M3FNUZ>
    %18307 = "torch_c.from_builtin_tensor"(%18306) : (tensor<4096x14336xf8E4M3FNUZ>) -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
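    // blk.28 parameter loads.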
    %18308 = "util.global.load"() <{global = @__auto.blk.28.attn_norm.weight}> : () -> tensor<4096xbf16>
    %18309 = "torch_c.from_builtin_tensor"(%18308) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
    %18310 = "util.global.load"() <{global = @"__auto.blk.28.attn_q.q_input:rscale"}> : () -> tensor<f32>
    %18311 = "torch_c.from_builtin_tensor"(%18310) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %18312 = "util.global.load"() <{global = @"__auto.blk.28.attn_q.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
    %18313 = "torch_c.from_builtin_tensor"(%18312) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
    %18314 = "util.global.load"() <{global = @"__auto.blk.28.attn_q.q_output:rscale"}> : () -> tensor<f32>
    %18315 = "torch_c.from_builtin_tensor"(%18314) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %18316 = "util.global.load"() <{global = @"__auto.blk.28.attn_k.q_input:rscale"}> : () -> tensor<f32>
    %18317 = "torch_c.from_builtin_tensor"(%18316) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %18318 = "util.global.load"() <{global = @"__auto.blk.28.attn_k.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
    %18319 = "torch_c.from_builtin_tensor"(%18318) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
    %18320 = "util.global.load"() <{global = @"__auto.blk.28.attn_k.q_output:rscale"}> : () -> tensor<f32>
    %18321 = "torch_c.from_builtin_tensor"(%18320) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %18322 = "util.global.load"() <{global = @"__auto.blk.28.attn_v.q_input:rscale"}> : () -> tensor<f32>
    %18323 = "torch_c.from_builtin_tensor"(%18322) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %18324 = "util.global.load"() <{global = @"__auto.blk.28.attn_v.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
    %18325 = "torch_c.from_builtin_tensor"(%18324) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
    %18326 = "util.global.load"() <{global = @"__auto.blk.28.attn_v.q_output:rscale"}> : () -> tensor<f32>
    %18327 = "torch_c.from_builtin_tensor"(%18326) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %18328 = "util.global.load"() <{global = @__auto.blk.28.attn_scale}> : () -> tensor<f32>
    %18329 = "torch_c.from_builtin_tensor"(%18328) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %18330 = "util.global.load"() <{global = @"__auto.blk.28.attn_output.q_input:rscale"}> : () -> tensor<f32>
    %18331 = "torch_c.from_builtin_tensor"(%18330) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %18332 = "util.global.load"() <{global = @"__auto.blk.28.attn_output.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
    %18333 = "torch_c.from_builtin_tensor"(%18332) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
    %18334 = "util.global.load"() <{global = @__auto.blk.28.ffn_norm.weight}> : () -> tensor<4096xbf16>
    %18335 = "torch_c.from_builtin_tensor"(%18334) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
    %18336 = "util.global.load"() <{global = @"__auto.blk.28.ffn_gate.q_input:rscale"}> : () -> tensor<f32>
    %18337 = "torch_c.from_builtin_tensor"(%18336) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %18338 = "util.global.load"() <{global = @"__auto.blk.28.ffn_gate.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
    %18339 = "torch_c.from_builtin_tensor"(%18338) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
    %18340 = "util.global.load"() <{global = @"__auto.blk.28.ffn_up.q_input:rscale"}> : () -> tensor<f32>
    %18341 = "torch_c.from_builtin_tensor"(%18340) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %18342 = "util.global.load"() <{global = @"__auto.blk.28.ffn_up.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
    %18343 = "torch_c.from_builtin_tensor"(%18342) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
    %18344 = "util.global.load"() <{global = @"__auto.blk.28.ffn_down.q_input:rscale"}> : () -> tensor<f32>
    %18345 = "torch_c.from_builtin_tensor"(%18344) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %18346 = "util.global.load"() <{global = @"__auto.blk.28.ffn_down.weight:qs"}> : () -> tensor<4096x14336xf8E4M3FNUZ>
    %18347 = "torch_c.from_builtin_tensor"(%18346) : (tensor<4096x14336xf8E4M3FNUZ>) -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
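    // blk.29 parameter loads.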
    %18348 = "util.global.load"() <{global = @__auto.blk.29.attn_norm.weight}> : () -> tensor<4096xbf16>
    %18349 = "torch_c.from_builtin_tensor"(%18348) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
    %18350 = "util.global.load"() <{global = @"__auto.blk.29.attn_q.q_input:rscale"}> : () -> tensor<f32>
    %18351 = "torch_c.from_builtin_tensor"(%18350) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %18352 = "util.global.load"() <{global = @"__auto.blk.29.attn_q.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
    %18353 = "torch_c.from_builtin_tensor"(%18352) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
    %18354 = "util.global.load"() <{global = @"__auto.blk.29.attn_q.q_output:rscale"}> : () -> tensor<f32>
    %18355 = "torch_c.from_builtin_tensor"(%18354) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %18356 = "util.global.load"() <{global = @"__auto.blk.29.attn_k.q_input:rscale"}> : () -> tensor<f32>
    %18357 = "torch_c.from_builtin_tensor"(%18356) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %18358 = "util.global.load"() <{global = @"__auto.blk.29.attn_k.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
    %18359 = "torch_c.from_builtin_tensor"(%18358) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
    %18360 = "util.global.load"() <{global = @"__auto.blk.29.attn_k.q_output:rscale"}> : () -> tensor<f32>
    %18361 = "torch_c.from_builtin_tensor"(%18360) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %18362 = "util.global.load"() <{global = @"__auto.blk.29.attn_v.q_input:rscale"}> : () -> tensor<f32>
    %18363 = "torch_c.from_builtin_tensor"(%18362) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %18364 = "util.global.load"() <{global = @"__auto.blk.29.attn_v.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
    %18365 = "torch_c.from_builtin_tensor"(%18364) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
    %18366 = "util.global.load"() <{global = @"__auto.blk.29.attn_v.q_output:rscale"}> : () -> tensor<f32>
    %18367 = "torch_c.from_builtin_tensor"(%18366) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %18368 = "util.global.load"() <{global = @__auto.blk.29.attn_scale}> : () -> tensor<f32>
    %18369 = "torch_c.from_builtin_tensor"(%18368) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %18370 = "util.global.load"() <{global = @"__auto.blk.29.attn_output.q_input:rscale"}> : () -> tensor<f32>
    %18371 = "torch_c.from_builtin_tensor"(%18370) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %18372 = "util.global.load"() <{global = @"__auto.blk.29.attn_output.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
    %18373 = "torch_c.from_builtin_tensor"(%18372) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
    %18374 = "util.global.load"() <{global = @__auto.blk.29.ffn_norm.weight}> : () -> tensor<4096xbf16>
    %18375 = "torch_c.from_builtin_tensor"(%18374) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
    %18376 = "util.global.load"() <{global = @"__auto.blk.29.ffn_gate.q_input:rscale"}> : () -> tensor<f32>
    %18377 = "torch_c.from_builtin_tensor"(%18376) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %18378 = "util.global.load"() <{global = @"__auto.blk.29.ffn_gate.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
    %18379 = "torch_c.from_builtin_tensor"(%18378) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
    %18380 = "util.global.load"() <{global = @"__auto.blk.29.ffn_up.q_input:rscale"}> : () -> tensor<f32>
    %18381 = "torch_c.from_builtin_tensor"(%18380) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %18382 = "util.global.load"() <{global = @"__auto.blk.29.ffn_up.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
    %18383 = "torch_c.from_builtin_tensor"(%18382) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
    %18384 = "util.global.load"() <{global = @"__auto.blk.29.ffn_down.q_input:rscale"}> : () -> tensor<f32>
    %18385 = "torch_c.from_builtin_tensor"(%18384) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %18386 = "util.global.load"() <{global = @"__auto.blk.29.ffn_down.weight:qs"}> : () -> tensor<4096x14336xf8E4M3FNUZ>
    %18387 = "torch_c.from_builtin_tensor"(%18386) : (tensor<4096x14336xf8E4M3FNUZ>) -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
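    // blk.30 parameter loads.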
    %18388 = "util.global.load"() <{global = @__auto.blk.30.attn_norm.weight}> : () -> tensor<4096xbf16>
    %18389 = "torch_c.from_builtin_tensor"(%18388) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
    %18390 = "util.global.load"() <{global = @"__auto.blk.30.attn_q.q_input:rscale"}> : () -> tensor<f32>
    %18391 = "torch_c.from_builtin_tensor"(%18390) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %18392 = "util.global.load"() <{global = @"__auto.blk.30.attn_q.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
    %18393 = "torch_c.from_builtin_tensor"(%18392) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
    %18394 = "util.global.load"() <{global = @"__auto.blk.30.attn_q.q_output:rscale"}> : () -> tensor<f32>
    %18395 = "torch_c.from_builtin_tensor"(%18394) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %18396 = "util.global.load"() <{global = @"__auto.blk.30.attn_k.q_input:rscale"}> : () -> tensor<f32>
    %18397 = "torch_c.from_builtin_tensor"(%18396) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %18398 = "util.global.load"() <{global = @"__auto.blk.30.attn_k.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
    %18399 = "torch_c.from_builtin_tensor"(%18398) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
    %18400 = "util.global.load"() <{global = @"__auto.blk.30.attn_k.q_output:rscale"}> : () -> tensor<f32>
    %18401 = "torch_c.from_builtin_tensor"(%18400) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %18402 = "util.global.load"() <{global = @"__auto.blk.30.attn_v.q_input:rscale"}> : () -> tensor<f32>
    %18403 = "torch_c.from_builtin_tensor"(%18402) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %18404 = "util.global.load"() <{global = @"__auto.blk.30.attn_v.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
    %18405 = "torch_c.from_builtin_tensor"(%18404) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
    %18406 = "util.global.load"() <{global = @"__auto.blk.30.attn_v.q_output:rscale"}> : () -> tensor<f32>
    %18407 = "torch_c.from_builtin_tensor"(%18406) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %18408 = "util.global.load"() <{global = @__auto.blk.30.attn_scale}> : () -> tensor<f32>
    %18409 = "torch_c.from_builtin_tensor"(%18408) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %18410 = "util.global.load"() <{global = @"__auto.blk.30.attn_output.q_input:rscale"}> : () -> tensor<f32>
    %18411 = "torch_c.from_builtin_tensor"(%18410) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %18412 = "util.global.load"() <{global = @"__auto.blk.30.attn_output.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
    %18413 = "torch_c.from_builtin_tensor"(%18412) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
    %18414 = "util.global.load"() <{global = @__auto.blk.30.ffn_norm.weight}> : () -> tensor<4096xbf16>
    %18415 = "torch_c.from_builtin_tensor"(%18414) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
    %18416 = "util.global.load"() <{global = @"__auto.blk.30.ffn_gate.q_input:rscale"}> : () -> tensor<f32>
    %18417 = "torch_c.from_builtin_tensor"(%18416) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %18418 = "util.global.load"() <{global = @"__auto.blk.30.ffn_gate.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
    %18419 = "torch_c.from_builtin_tensor"(%18418) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
    %18420 = "util.global.load"() <{global = @"__auto.blk.30.ffn_up.q_input:rscale"}> : () -> tensor<f32>
    %18421 = "torch_c.from_builtin_tensor"(%18420) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %18422 = "util.global.load"() <{global = @"__auto.blk.30.ffn_up.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
    %18423 = "torch_c.from_builtin_tensor"(%18422) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
    %18424 = "util.global.load"() <{global = @"__auto.blk.30.ffn_down.q_input:rscale"}> : () -> tensor<f32>
    %18425 = "torch_c.from_builtin_tensor"(%18424) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %18426 = "util.global.load"() <{global = @"__auto.blk.30.ffn_down.weight:qs"}> : () -> tensor<4096x14336xf8E4M3FNUZ>
    %18427 = "torch_c.from_builtin_tensor"(%18426) : (tensor<4096x14336xf8E4M3FNUZ>) -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
    %18428 = "util.global.load"() <{global = @__auto.blk.31.attn_norm.weight}> : () -> tensor<4096xbf16>
    %18429 = "torch_c.from_builtin_tensor"(%18428) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
    %18430 = "util.global.load"() <{global = @"__auto.blk.31.attn_q.q_input:rscale"}> : () -> tensor<f32>
    %18431 = "torch_c.from_builtin_tensor"(%18430) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %18432 = "util.global.load"() <{global = @"__auto.blk.31.attn_q.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
    %18433 = "torch_c.from_builtin_tensor"(%18432) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
    %18434 = "util.global.load"() <{global = @"__auto.blk.31.attn_q.q_output:rscale"}> : () -> tensor<f32>
    %18435 = "torch_c.from_builtin_tensor"(%18434) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %18436 = "util.global.load"() <{global = @"__auto.blk.31.attn_k.q_input:rscale"}> : () -> tensor<f32>
    %18437 = "torch_c.from_builtin_tensor"(%18436) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %18438 = "util.global.load"() <{global = @"__auto.blk.31.attn_k.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
    %18439 = "torch_c.from_builtin_tensor"(%18438) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
    %18440 = "util.global.load"() <{global = @"__auto.blk.31.attn_k.q_output:rscale"}> : () -> tensor<f32>
    %18441 = "torch_c.from_builtin_tensor"(%18440) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %18442 = "util.global.load"() <{global = @"__auto.blk.31.attn_v.q_input:rscale"}> : () -> tensor<f32>
    %18443 = "torch_c.from_builtin_tensor"(%18442) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %18444 = "util.global.load"() <{global = @"__auto.blk.31.attn_v.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
    %18445 = "torch_c.from_builtin_tensor"(%18444) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
    %18446 = "util.global.load"() <{global = @"__auto.blk.31.attn_v.q_output:rscale"}> : () -> tensor<f32>
    %18447 = "torch_c.from_builtin_tensor"(%18446) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %18448 = "util.global.load"() <{global = @__auto.blk.31.attn_scale}> : () -> tensor<f32>
    %18449 = "torch_c.from_builtin_tensor"(%18448) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %18450 = "util.global.load"() <{global = @"__auto.blk.31.attn_output.q_input:rscale"}> : () -> tensor<f32>
    %18451 = "torch_c.from_builtin_tensor"(%18450) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %18452 = "util.global.load"() <{global = @"__auto.blk.31.attn_output.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
    %18453 = "torch_c.from_builtin_tensor"(%18452) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
    %18454 = "util.global.load"() <{global = @__auto.blk.31.ffn_norm.weight}> : () -> tensor<4096xbf16>
    %18455 = "torch_c.from_builtin_tensor"(%18454) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
    %18456 = "util.global.load"() <{global = @"__auto.blk.31.ffn_gate.q_input:rscale"}> : () -> tensor<f32>
    %18457 = "torch_c.from_builtin_tensor"(%18456) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %18458 = "util.global.load"() <{global = @"__auto.blk.31.ffn_gate.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
    %18459 = "torch_c.from_builtin_tensor"(%18458) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
    %18460 = "util.global.load"() <{global = @"__auto.blk.31.ffn_up.q_input:rscale"}> : () -> tensor<f32>
    %18461 = "torch_c.from_builtin_tensor"(%18460) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %18462 = "util.global.load"() <{global = @"__auto.blk.31.ffn_up.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
    %18463 = "torch_c.from_builtin_tensor"(%18462) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
    %18464 = "util.global.load"() <{global = @"__auto.blk.31.ffn_down.q_input:rscale"}> : () -> tensor<f32>
    %18465 = "torch_c.from_builtin_tensor"(%18464) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %18466 = "util.global.load"() <{global = @"__auto.blk.31.ffn_down.weight:qs"}> : () -> tensor<4096x14336xf8E4M3FNUZ>
    %18467 = "torch_c.from_builtin_tensor"(%18466) : (tensor<4096x14336xf8E4M3FNUZ>) -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
    %18468 = "util.global.load"() <{global = @__auto.output_norm.weight}> : () -> tensor<4096xbf16>
    %18469 = "torch_c.from_builtin_tensor"(%18468) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
    %18470 = "util.global.load"() <{global = @__auto.output.weight}> : () -> tensor<128256x4096xbf16>
    %18471 = "torch_c.from_builtin_tensor"(%18470) : (tensor<128256x4096xbf16>) -> !torch.vtensor<[128256,4096],bf16>
    %18472 = "torch.copy.to_vtensor"(%arg70) : (!torch.tensor<[?,2097152],f8E4M3FNUZ>) -> !torch.vtensor<[?,2097152],f8E4M3FNUZ>
    %18473 = "torch.symbolic_int"() <{max_val = 131040 : i64, min_val = 64 : i64, symbol_name = "32*s1"}> : () -> !torch.int
    %18474 = "torch.symbolic_int"() <{max_val = 4095 : i64, min_val = 2 : i64, symbol_name = "s1"}> : () -> !torch.int
    %18475 = "torch.symbolic_int"() <{max_val = 9223372036854775807 : i64, min_val = 0 : i64, symbol_name = "s2"}> : () -> !torch.int
    "torch.bind_symbolic_shape"(%arg67, %18474) <{shape_expressions = #map}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    "torch.bind_symbolic_shape"(%arg69, %18474) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    "torch.bind_symbolic_shape"(%18472, %18475) <{shape_expressions = #map2}> : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> ()
    %18476 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %18477 = "torch.aten.size.int"(%arg69, %18476) : (!torch.vtensor<[4,?],si64>, !torch.int) -> !torch.int
    %18478 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %18479 = "torch.aten.size.int"(%18472, %18478) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> !torch.int
    %18480 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %18481 = "torch.aten.size.int"(%arg67, %18480) : (!torch.vtensor<[4,?],si64>, !torch.int) -> !torch.int
    %18482 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %18483 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %18484 = "torch.constant.none"() : () -> !torch.none
    %18485 = "torch.constant.none"() : () -> !torch.none
    %18486 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %18487 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %18488 = "torch.aten.arange.start_step"(%18482, %18481, %18483, %18484, %18485, %18486, %18487) : (!torch.int, !torch.int, !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[?],si64>
    "torch.bind_symbolic_shape"(%18488, %18474) <{shape_expressions = #map3}> : (!torch.vtensor<[?],si64>, !torch.int) -> ()
    %18489 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %18490 = "torch.aten.unsqueeze"(%arg68, %18489) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %18491 = "torch.aten.ge.Tensor"(%18488, %18490) : (!torch.vtensor<[?],si64>, !torch.vtensor<[4,1],si64>) -> !torch.vtensor<[4,?],i1>
    "torch.bind_symbolic_shape"(%18491, %18474) <{shape_expressions = #map}> : (!torch.vtensor<[4,?],i1>, !torch.int) -> ()
    %18492 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %18493 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %18494 = "torch.prim.ListConstruct"(%18492, %18493) : (!torch.int, !torch.int) -> !torch.list<int>
    %18495 = "torch.constant.int"() <{value = 11 : i64}> : () -> !torch.int
    %18496 = "torch.constant.none"() : () -> !torch.none
    %18497 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %18498 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %18499 = "torch.aten.ones"(%18494, %18495, %18496, %18497, %18498) : (!torch.list<int>, !torch.int, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[1,1],i1>
    %18500 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
    %18501 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
    %18502 = "torch.prim.ListConstruct"(%18500, %18501) : (!torch.int, !torch.int) -> !torch.list<int>
    %18503 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %18504 = "torch.aten.expand"(%18499, %18502, %18503) : (!torch.vtensor<[1,1],i1>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[131072,131072],i1>
    %18505 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %18506 = "torch.aten.triu"(%18504, %18505) : (!torch.vtensor<[131072,131072],i1>, !torch.int) -> !torch.vtensor<[131072,131072],i1>
    %18507 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %18508 = "torch.aten.unsqueeze"(%18506, %18507) : (!torch.vtensor<[131072,131072],i1>, !torch.int) -> !torch.vtensor<[1,131072,131072],i1>
    %18509 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %18510 = "torch.aten.unsqueeze"(%18508, %18509) : (!torch.vtensor<[1,131072,131072],i1>, !torch.int) -> !torch.vtensor<[1,1,131072,131072],i1>
    %18511 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %18512 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %18513 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %18514 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %18515 = "torch.aten.slice.Tensor"(%18510, %18511, %18512, %18513, %18514) : (!torch.vtensor<[1,1,131072,131072],i1>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,1,131072,131072],i1>
    %18516 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %18517 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %18518 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %18519 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %18520 = "torch.aten.slice.Tensor"(%18515, %18516, %18517, %18518, %18519) : (!torch.vtensor<[1,1,131072,131072],i1>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,1,131072,131072],i1>
    %18521 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %18522 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %18523 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %18524 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %18525 = "torch.aten.slice.Tensor"(%18520, %18521, %18522, %18523, %18524) : (!torch.vtensor<[1,1,131072,131072],i1>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,1,131072,131072],i1>
    %18526 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %18527 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %18528 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %18529 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %18530 = "torch.aten.slice.Tensor"(%18525, %18526, %18527, %18528, %18529) : (!torch.vtensor<[1,1,131072,131072],i1>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,1,131072,131072],i1>
    %18531 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %18532 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %18533 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %18534 = "torch.aten.slice.Tensor"(%18530, %18531, %18532, %18481, %18533) : (!torch.vtensor<[1,1,131072,131072],i1>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,1,?,131072],i1>
    "torch.bind_symbolic_shape"(%18534, %18474) <{shape_expressions = #map4}> : (!torch.vtensor<[1,1,?,131072],i1>, !torch.int) -> ()
    %18535 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %18536 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %18537 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %18538 = "torch.aten.slice.Tensor"(%18534, %18535, %18536, %18481, %18537) : (!torch.vtensor<[1,1,?,131072],i1>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,1,?,?],i1>
    "torch.bind_symbolic_shape"(%18538, %18474) <{shape_expressions = #map5}> : (!torch.vtensor<[1,1,?,?],i1>, !torch.int) -> ()
    %18539 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %18540 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %18541 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %18542 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %18543 = "torch.aten.slice.Tensor"(%18491, %18539, %18540, %18541, %18542) : (!torch.vtensor<[4,?],i1>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?],i1>
    "torch.bind_symbolic_shape"(%18543, %18474) <{shape_expressions = #map}> : (!torch.vtensor<[4,?],i1>, !torch.int) -> ()
    %18544 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %18545 = "torch.aten.unsqueeze"(%18543, %18544) : (!torch.vtensor<[4,?],i1>, !torch.int) -> !torch.vtensor<[4,1,?],i1>
    "torch.bind_symbolic_shape"(%18545, %18474) <{shape_expressions = #map6}> : (!torch.vtensor<[4,1,?],i1>, !torch.int) -> ()
    %18546 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %18547 = "torch.aten.unsqueeze"(%18545, %18546) : (!torch.vtensor<[4,1,?],i1>, !torch.int) -> !torch.vtensor<[4,1,1,?],i1>
    "torch.bind_symbolic_shape"(%18547, %18474) <{shape_expressions = #map7}> : (!torch.vtensor<[4,1,1,?],i1>, !torch.int) -> ()
    %18548 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %18549 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %18550 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %18551 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %18552 = "torch.aten.slice.Tensor"(%18547, %18548, %18549, %18550, %18551) : (!torch.vtensor<[4,1,1,?],i1>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,1,1,?],i1>
    "torch.bind_symbolic_shape"(%18552, %18474) <{shape_expressions = #map7}> : (!torch.vtensor<[4,1,1,?],i1>, !torch.int) -> ()
    %18553 = "torch.aten.logical_or"(%18538, %18552) : (!torch.vtensor<[1,1,?,?],i1>, !torch.vtensor<[4,1,1,?],i1>) -> !torch.vtensor<[4,1,?,?],i1>
    "torch.bind_symbolic_shape"(%18553, %18474) <{shape_expressions = #map8}> : (!torch.vtensor<[4,1,?,?],i1>, !torch.int) -> ()
    %18554 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %18555 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %18556 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %18557 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %18558 = "torch.constant.none"() : () -> !torch.none
    %18559 = "torch.aten.scalar_tensor"(%18554, %18555, %18556, %18557, %18558) : (!torch.int, !torch.int, !torch.int, !torch.Device, !torch.none) -> !torch.vtensor<[],f32>
    %18560 = "torch.constant.float"() <{value = 0xFFF0000000000000 : f64}> : () -> !torch.float
    %18561 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %18562 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %18563 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %18564 = "torch.constant.none"() : () -> !torch.none
    %18565 = "torch.aten.scalar_tensor"(%18560, %18561, %18562, %18563, %18564) : (!torch.float, !torch.int, !torch.int, !torch.Device, !torch.none) -> !torch.vtensor<[],f32>
    %18566 = "torch.aten.where.self"(%18553, %18565, %18559) : (!torch.vtensor<[4,1,?,?],i1>, !torch.vtensor<[],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,?,?],f32>
    "torch.bind_symbolic_shape"(%18566, %18474) <{shape_expressions = #map8}> : (!torch.vtensor<[4,1,?,?],f32>, !torch.int) -> ()
    %18567 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %18568 = "torch.prims.convert_element_type"(%18566, %18567) : (!torch.vtensor<[4,1,?,?],f32>, !torch.int) -> !torch.vtensor<[4,1,?,?],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%18568, %18474) <{shape_expressions = #map8}> : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> ()
    %18569 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %18570 = "torch.prims.convert_element_type"(%18568, %18569) : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,1,?,?],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%18570, %18474) <{shape_expressions = #map8}> : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> ()
    %18571 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %18572 = "torch.prims.convert_element_type"(%17187, %18571) : (!torch.vtensor<[128256,4096],bf16>, !torch.int) -> !torch.vtensor<[128256,4096],bf16>
    %18573 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %18574 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %18575 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %18576 = "torch.aten.embedding"(%18572, %arg67, %18573, %18574, %18575) : (!torch.vtensor<[128256,4096],bf16>, !torch.vtensor<[4,?],si64>, !torch.int, !torch.bool, !torch.bool) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%18576, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %18577 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %18578 = "torch.prims.convert_element_type"(%18576, %18577) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%18578, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %18579 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %18580 = "torch.aten.pow.Tensor_Scalar"(%18578, %18579) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%18580, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %18581 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %18582 = "torch.prim.ListConstruct"(%18581) : (!torch.int) -> !torch.list<int>
    %18583 = "torch.constant.bool"() <{value = true}> : () -> !torch.bool
    %18584 = "torch.constant.none"() : () -> !torch.none
    %18585 = "torch.aten.mean.dim"(%18580, %18582, %18583, %18584) : (!torch.vtensor<[4,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%18585, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %18586 = "torch.constant.float"() <{value = 1.000000e-05 : f64}> : () -> !torch.float
    %18587 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %18588 = "torch.aten.add.Scalar"(%18585, %18586, %18587) : (!torch.vtensor<[4,?,1],f32>, !torch.float, !torch.int) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%18588, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %18589 = "torch.aten.rsqrt"(%18588) : (!torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%18589, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %18590 = "torch.aten.mul.Tensor"(%18578, %18589) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%18590, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %18591 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %18592 = "torch.prims.convert_element_type"(%18590, %18591) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%18592, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %18593 = "torch.aten.mul.Tensor"(%17189, %18592) : (!torch.vtensor<[4096],bf16>, !torch.vtensor<[4,?,4096],bf16>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%18593, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %18594 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %18595 = "torch.prims.convert_element_type"(%18593, %18594) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%18595, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %18596 = "torch.aten.div.Tensor"(%18595, %17191) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%18596, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %18597 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %18598 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %18599 = "torch.aten.clamp"(%18596, %18597, %18598) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%18599, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %18600 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %18601 = "torch.prims.convert_element_type"(%18599, %18600) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%18601, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %18602 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %18603 = "torch.aten.unsqueeze"(%17193, %18602) : (!torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>
    %18604 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %18605 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %18606 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %18607 = "torch.prim.ListConstruct"(%18604, %18605, %18606) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %18608 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %18609 = "torch.aten.expand"(%18603, %18607, %18608) : (!torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ>
    %18610 = "torch_c.to_builtin_tensor"(%18601) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %18611 = "torch_c.to_builtin_tensor"(%18609) : (!torch.vtensor<[4,4096,4096],f8E4M3FNUZ>) -> tensor<4x4096x4096xf8E4M3FNUZ>
    %18612 = "util.call"(%18610, %18611) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x?x4096xf32>
    %18613 = "torch_c.from_builtin_tensor"(%18612) : (tensor<4x?x4096xf32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%18613, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %18614 = "torch.aten.div.Tensor"(%18613, %17195) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%18614, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %18615 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %18616 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %18617 = "torch.aten.clamp"(%18614, %18615, %18616) : (!torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%18617, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %18618 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %18619 = "torch.prims.convert_element_type"(%18617, %18618) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%18619, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %18620 = "torch.aten.div.Tensor"(%18595, %17197) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%18620, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %18621 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %18622 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %18623 = "torch.aten.clamp"(%18620, %18621, %18622) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%18623, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %18624 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %18625 = "torch.prims.convert_element_type"(%18623, %18624) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%18625, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %18626 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %18627 = "torch.aten.unsqueeze"(%17199, %18626) : (!torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>
    %18628 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %18629 = "torch.constant.int"() <{value = 1024 : i64}> : () -> !torch.int
    %18630 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %18631 = "torch.prim.ListConstruct"(%18628, %18629, %18630) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %18632 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %18633 = "torch.aten.expand"(%18627, %18631, %18632) : (!torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ>
    %18634 = "torch_c.to_builtin_tensor"(%18625) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %18635 = "torch_c.to_builtin_tensor"(%18633) : (!torch.vtensor<[4,1024,4096],f8E4M3FNUZ>) -> tensor<4x1024x4096xf8E4M3FNUZ>
    %18636 = "util.call"(%18634, %18635) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x?x1024xf32>
    %18637 = "torch_c.from_builtin_tensor"(%18636) : (tensor<4x?x1024xf32>) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%18637, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %18638 = "torch.aten.div.Tensor"(%18637, %17201) : (!torch.vtensor<[4,?,1024],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%18638, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %18639 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %18640 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %18641 = "torch.aten.clamp"(%18638, %18639, %18640) : (!torch.vtensor<[4,?,1024],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%18641, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %18642 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %18643 = "torch.prims.convert_element_type"(%18641, %18642) : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> !torch.vtensor<[4,?,1024],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%18643, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.int) -> ()
    %18644 = "torch.aten.div.Tensor"(%18595, %17203) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%18644, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %18645 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %18646 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %18647 = "torch.aten.clamp"(%18644, %18645, %18646) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%18647, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %18648 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %18649 = "torch.prims.convert_element_type"(%18647, %18648) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%18649, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %18650 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %18651 = "torch.aten.unsqueeze"(%17205, %18650) : (!torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>
    %18652 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %18653 = "torch.constant.int"() <{value = 1024 : i64}> : () -> !torch.int
    %18654 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %18655 = "torch.prim.ListConstruct"(%18652, %18653, %18654) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %18656 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %18657 = "torch.aten.expand"(%18651, %18655, %18656) : (!torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ>
    %18658 = "torch_c.to_builtin_tensor"(%18649) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %18659 = "torch_c.to_builtin_tensor"(%18657) : (!torch.vtensor<[4,1024,4096],f8E4M3FNUZ>) -> tensor<4x1024x4096xf8E4M3FNUZ>
    %18660 = "util.call"(%18658, %18659) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x?x1024xf32>
    %18661 = "torch_c.from_builtin_tensor"(%18660) : (tensor<4x?x1024xf32>) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%18661, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %18662 = "torch.aten.div.Tensor"(%18661, %17207) : (!torch.vtensor<[4,?,1024],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%18662, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %18663 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %18664 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %18665 = "torch.aten.clamp"(%18662, %18663, %18664) : (!torch.vtensor<[4,?,1024],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%18665, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %18666 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %18667 = "torch.prims.convert_element_type"(%18665, %18666) : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> !torch.vtensor<[4,?,1024],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%18667, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.int) -> ()
    %18668 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %18669 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %18670 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %18671 = "torch.prim.ListConstruct"(%18668, %18481, %18669, %18670) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %18672 = "torch.aten.view"(%18619, %18671) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%18672, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %18673 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %18674 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %18675 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %18676 = "torch.prim.ListConstruct"(%18673, %18481, %18674, %18675) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %18677 = "torch.aten.view"(%18643, %18676) : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%18677, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %18678 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %18679 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %18680 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %18681 = "torch.prim.ListConstruct"(%18678, %18481, %18679, %18680) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %18682 = "torch.aten.view"(%18667, %18681) : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%18682, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %18683 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
    %18684 = "torch.constant.none"() : () -> !torch.none
    %18685 = "torch.constant.none"() : () -> !torch.none
    %18686 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %18687 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %18688 = "torch.aten.arange"(%18683, %18684, %18685, %18686, %18687) : (!torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[131072],si64>
    %18689 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %18690 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %18691 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %18692 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %18693 = "torch.constant.none"() : () -> !torch.none
    %18694 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %18695 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %18696 = "torch.aten.arange.start_step"(%18689, %18690, %18691, %18692, %18693, %18694, %18695) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[64],si64>
    %18697 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %18698 = "torch.prims.convert_element_type"(%18696, %18697) : (!torch.vtensor<[64],si64>, !torch.int) -> !torch.vtensor<[64],f32>
    %18699 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %18700 = "torch.aten.div.Scalar"(%18698, %18699) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %18701 = "torch.constant.float"() <{value = 5.000000e+05 : f64}> : () -> !torch.float
    %18702 = "torch.aten.pow.Scalar"(%18701, %18700) : (!torch.float, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %18703 = "torch.aten.reciprocal"(%18702) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %18704 = "torch.constant.float"() <{value = 1.000000e+00 : f64}> : () -> !torch.float
    %18705 = "torch.aten.mul.Scalar"(%18703, %18704) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
    %18706 = "torch.aten.reciprocal"(%18705) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %18707 = "torch.constant.float"() <{value = 6.2831853071795862 : f64}> : () -> !torch.float
    %18708 = "torch.aten.mul.Scalar"(%18706, %18707) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
    %18709 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
    %18710 = "torch.aten.gt.Scalar"(%18708, %18709) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %18711 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %18712 = "torch.aten.div.Scalar"(%18705, %18711) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %18713 = "torch.aten.where.self"(%18710, %18712, %18705) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %18714 = "torch.aten.reciprocal"(%18708) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %18715 = "torch.constant.int"() <{value = 8192 : i64}> : () -> !torch.int
    %18716 = "torch.aten.mul.Scalar"(%18714, %18715) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %18717 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %18718 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %18719 = "torch.aten.sub.Scalar"(%18716, %18717, %18718) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
    %18720 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %18721 = "torch.aten.div.Scalar"(%18719, %18720) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %18722 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %18723 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %18724 = "torch.aten.rsub.Scalar"(%18721, %18722, %18723) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
    %18725 = "torch.aten.mul.Tensor"(%18724, %18713) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %18726 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %18727 = "torch.aten.div.Scalar"(%18725, %18726) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %18728 = "torch.aten.mul.Tensor"(%18721, %18713) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %18729 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %18730 = "torch.aten.add.Tensor"(%18727, %18728, %18729) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %18731 = "torch.constant.float"() <{value = 2.048000e+03 : f64}> : () -> !torch.float
    %18732 = "torch.aten.lt.Scalar"(%18708, %18731) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %18733 = "torch.aten.bitwise_not"(%18732) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %18734 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
    %18735 = "torch.aten.gt.Scalar"(%18708, %18734) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %18736 = "torch.aten.bitwise_not"(%18735) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %18737 = "torch.aten.mul.Tensor"(%18733, %18736) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %18738 = "torch.aten.where.self"(%18737, %18730, %18713) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %18739 = "torch.prim.ListConstruct"(%18738, %18738) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.list<vtensor>
    %18740 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %18741 = "torch.aten.cat"(%18739, %18740) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[128],f32>
    %18742 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %18743 = "torch.prims.convert_element_type"(%18688, %18742) : (!torch.vtensor<[131072],si64>, !torch.int) -> !torch.vtensor<[131072],f32>
    %18744 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %18745 = "torch.prims.convert_element_type"(%18741, %18744) : (!torch.vtensor<[128],f32>, !torch.int) -> !torch.vtensor<[128],f32>
    %18746 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
    %18747 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %18748 = "torch.prim.ListConstruct"(%18746, %18747) : (!torch.int, !torch.int) -> !torch.list<int>
    %18749 = "torch.aten.view"(%18743, %18748) : (!torch.vtensor<[131072],f32>, !torch.list<int>) -> !torch.vtensor<[131072,1],f32>
    %18750 = "torch.aten.mul.Tensor"(%18749, %18745) : (!torch.vtensor<[131072,1],f32>, !torch.vtensor<[128],f32>) -> !torch.vtensor<[131072,128],f32>
    %18751 = "torch.aten.cos"(%18750) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
    %18752 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %18753 = "torch.prims.convert_element_type"(%18751, %18752) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
    %18754 = "torch.aten.sin"(%18750) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
    %18755 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %18756 = "torch.prims.convert_element_type"(%18754, %18755) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
    %18757 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %18758 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %18759 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %18760 = "torch.aten.slice.Tensor"(%18753, %18757, %18758, %18481, %18759) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%18760, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %18761 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %18762 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %18763 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %18764 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %18765 = "torch.aten.slice.Tensor"(%18760, %18761, %18762, %18763, %18764) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%18765, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %18766 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %18767 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %18768 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %18769 = "torch.aten.slice.Tensor"(%18756, %18766, %18767, %18481, %18768) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%18769, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %18770 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %18771 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %18772 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %18773 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %18774 = "torch.aten.slice.Tensor"(%18769, %18770, %18771, %18772, %18773) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%18774, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %18775 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %18776 = "torch.aten.unsqueeze"(%18765, %18775) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%18776, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %18777 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %18778 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %18779 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %18780 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %18781 = "torch.aten.slice.Tensor"(%18776, %18777, %18778, %18779, %18780) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%18781, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %18782 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %18783 = "torch.aten.unsqueeze"(%18781, %18782) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%18783, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %18784 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %18785 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %18786 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %18787 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %18788 = "torch.aten.slice.Tensor"(%18783, %18784, %18785, %18786, %18787) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%18788, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %18789 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %18790 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %18791 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %18792 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %18793 = "torch.prim.ListConstruct"(%18789, %18790, %18791, %18792) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %18794 = "torch.aten.repeat"(%18788, %18793) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%18794, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
    %18795 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %18796 = "torch.aten.unsqueeze"(%18774, %18795) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%18796, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %18797 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %18798 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %18799 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %18800 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %18801 = "torch.aten.slice.Tensor"(%18796, %18797, %18798, %18799, %18800) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%18801, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %18802 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %18803 = "torch.aten.unsqueeze"(%18801, %18802) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%18803, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %18804 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %18805 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %18806 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %18807 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %18808 = "torch.aten.slice.Tensor"(%18803, %18804, %18805, %18806, %18807) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%18808, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %18809 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %18810 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %18811 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %18812 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %18813 = "torch.prim.ListConstruct"(%18809, %18810, %18811, %18812) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %18814 = "torch.aten.repeat"(%18808, %18813) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%18814, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
    %18815 = "torch.aten.mul.Tensor"(%18672, %18794) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%18815, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %18816 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %18817 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %18818 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %18819 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %18820 = "torch.aten.slice.Tensor"(%18672, %18816, %18817, %18818, %18819) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%18820, %18474) <{shape_expressions = #map18}> : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.int) -> ()
    %18821 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %18822 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %18823 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %18824 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %18825 = "torch.aten.slice.Tensor"(%18672, %18821, %18822, %18823, %18824) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%18825, %18474) <{shape_expressions = #map18}> : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.int) -> ()
    %18826 = "torch.aten.neg"(%18825) : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>) -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%18826, %18474) <{shape_expressions = #map18}> : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.int) -> ()
    %18827 = "torch.prim.ListConstruct"(%18826, %18820) : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>) -> !torch.list<vtensor>
    %18828 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %18829 = "torch.aten.cat"(%18827, %18828) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%18829, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %18830 = "torch.aten.mul.Tensor"(%18829, %18814) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%18830, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %18831 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %18832 = "torch.aten.add.Tensor"(%18815, %18830, %18831) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%18832, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %18833 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
    %18834 = "torch.constant.none"() : () -> !torch.none
    %18835 = "torch.constant.none"() : () -> !torch.none
    %18836 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %18837 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %18838 = "torch.aten.arange"(%18833, %18834, %18835, %18836, %18837) : (!torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[131072],si64>
    %18839 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %18840 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %18841 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %18842 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %18843 = "torch.constant.none"() : () -> !torch.none
    %18844 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %18845 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %18846 = "torch.aten.arange.start_step"(%18839, %18840, %18841, %18842, %18843, %18844, %18845) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[64],si64>
    %18847 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %18848 = "torch.prims.convert_element_type"(%18846, %18847) : (!torch.vtensor<[64],si64>, !torch.int) -> !torch.vtensor<[64],f32>
    %18849 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %18850 = "torch.aten.div.Scalar"(%18848, %18849) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %18851 = "torch.constant.float"() <{value = 5.000000e+05 : f64}> : () -> !torch.float
    %18852 = "torch.aten.pow.Scalar"(%18851, %18850) : (!torch.float, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %18853 = "torch.aten.reciprocal"(%18852) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %18854 = "torch.constant.float"() <{value = 1.000000e+00 : f64}> : () -> !torch.float
    %18855 = "torch.aten.mul.Scalar"(%18853, %18854) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
    %18856 = "torch.aten.reciprocal"(%18855) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %18857 = "torch.constant.float"() <{value = 6.2831853071795862 : f64}> : () -> !torch.float
    %18858 = "torch.aten.mul.Scalar"(%18856, %18857) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
    %18859 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
    %18860 = "torch.aten.gt.Scalar"(%18858, %18859) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %18861 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %18862 = "torch.aten.div.Scalar"(%18855, %18861) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %18863 = "torch.aten.where.self"(%18860, %18862, %18855) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %18864 = "torch.aten.reciprocal"(%18858) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %18865 = "torch.constant.int"() <{value = 8192 : i64}> : () -> !torch.int
    %18866 = "torch.aten.mul.Scalar"(%18864, %18865) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %18867 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %18868 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %18869 = "torch.aten.sub.Scalar"(%18866, %18867, %18868) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
    %18870 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %18871 = "torch.aten.div.Scalar"(%18869, %18870) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %18872 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %18873 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %18874 = "torch.aten.rsub.Scalar"(%18871, %18872, %18873) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
    %18875 = "torch.aten.mul.Tensor"(%18874, %18863) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %18876 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %18877 = "torch.aten.div.Scalar"(%18875, %18876) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %18878 = "torch.aten.mul.Tensor"(%18871, %18863) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %18879 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %18880 = "torch.aten.add.Tensor"(%18877, %18878, %18879) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %18881 = "torch.constant.float"() <{value = 2.048000e+03 : f64}> : () -> !torch.float
    %18882 = "torch.aten.lt.Scalar"(%18858, %18881) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %18883 = "torch.aten.bitwise_not"(%18882) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %18884 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
    %18885 = "torch.aten.gt.Scalar"(%18858, %18884) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %18886 = "torch.aten.bitwise_not"(%18885) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %18887 = "torch.aten.mul.Tensor"(%18883, %18886) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %18888 = "torch.aten.where.self"(%18887, %18880, %18863) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %18889 = "torch.prim.ListConstruct"(%18888, %18888) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.list<vtensor>
    %18890 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %18891 = "torch.aten.cat"(%18889, %18890) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[128],f32>
    %18892 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %18893 = "torch.prims.convert_element_type"(%18838, %18892) : (!torch.vtensor<[131072],si64>, !torch.int) -> !torch.vtensor<[131072],f32>
    %18894 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %18895 = "torch.prims.convert_element_type"(%18891, %18894) : (!torch.vtensor<[128],f32>, !torch.int) -> !torch.vtensor<[128],f32>
    %18896 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
    %18897 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %18898 = "torch.prim.ListConstruct"(%18896, %18897) : (!torch.int, !torch.int) -> !torch.list<int>
    %18899 = "torch.aten.view"(%18893, %18898) : (!torch.vtensor<[131072],f32>, !torch.list<int>) -> !torch.vtensor<[131072,1],f32>
    %18900 = "torch.aten.mul.Tensor"(%18899, %18895) : (!torch.vtensor<[131072,1],f32>, !torch.vtensor<[128],f32>) -> !torch.vtensor<[131072,128],f32>
    %18901 = "torch.aten.cos"(%18900) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
    %18902 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %18903 = "torch.prims.convert_element_type"(%18901, %18902) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
    %18904 = "torch.aten.sin"(%18900) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
    %18905 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %18906 = "torch.prims.convert_element_type"(%18904, %18905) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
    %18907 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %18908 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %18909 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %18910 = "torch.aten.slice.Tensor"(%18903, %18907, %18908, %18481, %18909) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%18910, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %18911 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %18912 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %18913 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %18914 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %18915 = "torch.aten.slice.Tensor"(%18910, %18911, %18912, %18913, %18914) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%18915, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %18916 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %18917 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %18918 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %18919 = "torch.aten.slice.Tensor"(%18906, %18916, %18917, %18481, %18918) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%18919, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %18920 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %18921 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %18922 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %18923 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %18924 = "torch.aten.slice.Tensor"(%18919, %18920, %18921, %18922, %18923) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%18924, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %18925 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %18926 = "torch.aten.unsqueeze"(%18915, %18925) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%18926, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %18927 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %18928 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %18929 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %18930 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %18931 = "torch.aten.slice.Tensor"(%18926, %18927, %18928, %18929, %18930) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%18931, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %18932 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %18933 = "torch.aten.unsqueeze"(%18931, %18932) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%18933, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %18934 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %18935 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %18936 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %18937 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %18938 = "torch.aten.slice.Tensor"(%18933, %18934, %18935, %18936, %18937) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%18938, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %18939 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %18940 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %18941 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %18942 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %18943 = "torch.prim.ListConstruct"(%18939, %18940, %18941, %18942) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %18944 = "torch.aten.repeat"(%18938, %18943) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%18944, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
    %18945 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %18946 = "torch.aten.unsqueeze"(%18924, %18945) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%18946, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %18947 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %18948 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %18949 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %18950 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %18951 = "torch.aten.slice.Tensor"(%18946, %18947, %18948, %18949, %18950) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%18951, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %18952 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %18953 = "torch.aten.unsqueeze"(%18951, %18952) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%18953, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %18954 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %18955 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %18956 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %18957 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %18958 = "torch.aten.slice.Tensor"(%18953, %18954, %18955, %18956, %18957) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%18958, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %18959 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %18960 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %18961 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %18962 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %18963 = "torch.prim.ListConstruct"(%18959, %18960, %18961, %18962) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %18964 = "torch.aten.repeat"(%18958, %18963) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%18964, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
    %18965 = "torch.aten.mul.Tensor"(%18677, %18944) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%18965, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %18966 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %18967 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %18968 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %18969 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %18970 = "torch.aten.slice.Tensor"(%18677, %18966, %18967, %18968, %18969) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%18970, %18474) <{shape_expressions = #map19}> : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.int) -> ()
    %18971 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %18972 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %18973 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %18974 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %18975 = "torch.aten.slice.Tensor"(%18677, %18971, %18972, %18973, %18974) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%18975, %18474) <{shape_expressions = #map19}> : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.int) -> ()
    %18976 = "torch.aten.neg"(%18975) : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>) -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%18976, %18474) <{shape_expressions = #map19}> : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.int) -> ()
    %18977 = "torch.prim.ListConstruct"(%18976, %18970) : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>) -> !torch.list<vtensor>
    %18978 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %18979 = "torch.aten.cat"(%18977, %18978) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%18979, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %18980 = "torch.aten.mul.Tensor"(%18979, %18964) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%18980, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %18981 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %18982 = "torch.aten.add.Tensor"(%18965, %18980, %18981) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%18982, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %18983 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %18984 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %18985 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %18986 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %18987 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %18988 = "torch.prim.ListConstruct"(%18479, %18983, %18984, %18985, %18986, %18987) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %18989 = "torch.aten.view"(%18472, %18988) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%18989, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %18990 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %18991 = "torch.aten.mul.int"(%18479, %18990) : (!torch.int, !torch.int) -> !torch.int
    %18992 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %18993 = "torch.aten.mul.int"(%18991, %18992) : (!torch.int, !torch.int) -> !torch.int
    %18994 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %18995 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %18996 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %18997 = "torch.prim.ListConstruct"(%18993, %18994, %18995, %18996) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %18998 = "torch.aten.view"(%18989, %18997) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%18998, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %18999 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %19000 = "torch.aten.mul.Scalar"(%arg69, %18999) : (!torch.vtensor<[4,?],si64>, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%19000, %18474) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %19001 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %19002 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %19003 = "torch.aten.add.Scalar"(%19000, %19001, %19002) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%19003, %18474) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %19004 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %19005 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %19006 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %19007 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %19008 = "torch.prim.ListConstruct"(%19004, %18477, %19005, %19006, %19007) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %19009 = "torch.aten.view"(%18982, %19008) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%19009, %18474) <{shape_expressions = #map22}> : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %19010 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %19011 = "torch.aten.mul.int"(%19010, %18477) : (!torch.int, !torch.int) -> !torch.int
    %19012 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %19013 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %19014 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %19015 = "torch.prim.ListConstruct"(%19011, %19012, %19013, %19014) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %19016 = "torch.aten.view"(%19009, %19015) : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%19016, %18474) <{shape_expressions = #map23}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %19017 = "torch.prim.ListConstruct"(%19011) : (!torch.int) -> !torch.list<int>
    %19018 = "torch.aten.view"(%19003, %19017) : (!torch.vtensor<[4,?],si64>, !torch.list<int>) -> !torch.vtensor<[?],si64>
    "torch.bind_symbolic_shape"(%19018, %18474) <{shape_expressions = #map24}> : (!torch.vtensor<[?],si64>, !torch.int) -> ()
    %19019 = "torch.prim.ListConstruct"(%19018) : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>>
    %19020 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %19021 = "torch.aten.index_put"(%18998, %19019, %19016, %19020) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.bool) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%19021, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %19022 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %19023 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %19024 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %19025 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %19026 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %19027 = "torch.prim.ListConstruct"(%18479, %19022, %19023, %19024, %19025, %19026) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %19028 = "torch.aten.view"(%19021, %19027) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%19028, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %19029 = "torch.constant.int"() <{value = 2097152 : i64}> : () -> !torch.int
    %19030 = "torch.prim.ListConstruct"(%18479, %19029) : (!torch.int, !torch.int) -> !torch.list<int>
    %19031 = "torch.aten.view"(%19028, %19030) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,2097152],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%19031, %18475) <{shape_expressions = #map2}> : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> ()
    %19032 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %19033 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %19034 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %19035 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %19036 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %19037 = "torch.prim.ListConstruct"(%18479, %19032, %19033, %19034, %19035, %19036) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %19038 = "torch.aten.view"(%19031, %19037) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%19038, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %19039 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %19040 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %19041 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %19042 = "torch.prim.ListConstruct"(%18993, %19039, %19040, %19041) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %19043 = "torch.aten.view"(%19038, %19042) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%19043, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %19044 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %19045 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %19046 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %19047 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %19048 = "torch.prim.ListConstruct"(%19044, %18477, %19045, %19046, %19047) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %19049 = "torch.aten.view"(%18682, %19048) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%19049, %18474) <{shape_expressions = #map22}> : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %19050 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %19051 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %19052 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %19053 = "torch.prim.ListConstruct"(%19011, %19050, %19051, %19052) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %19054 = "torch.aten.view"(%19049, %19053) : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%19054, %18474) <{shape_expressions = #map23}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %19055 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %19056 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %19057 = "torch.aten.add.Scalar"(%19003, %19055, %19056) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%19057, %18474) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %19058 = "torch.prim.ListConstruct"(%19011) : (!torch.int) -> !torch.list<int>
    %19059 = "torch.aten.view"(%19057, %19058) : (!torch.vtensor<[4,?],si64>, !torch.list<int>) -> !torch.vtensor<[?],si64>
    "torch.bind_symbolic_shape"(%19059, %18474) <{shape_expressions = #map24}> : (!torch.vtensor<[?],si64>, !torch.int) -> ()
    %19060 = "torch.prim.ListConstruct"(%19059) : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>>
    %19061 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %19062 = "torch.aten.index_put"(%19043, %19060, %19054, %19061) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.bool) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%19062, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %19063 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %19064 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %19065 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %19066 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %19067 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %19068 = "torch.prim.ListConstruct"(%18479, %19063, %19064, %19065, %19066, %19067) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %19069 = "torch.aten.view"(%19062, %19068) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%19069, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %19070 = "torch.constant.int"() <{value = 2097152 : i64}> : () -> !torch.int
    %19071 = "torch.prim.ListConstruct"(%18479, %19070) : (!torch.int, !torch.int) -> !torch.list<int>
    %19072 = "torch.aten.view"(%19069, %19071) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,2097152],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%19072, %18475) <{shape_expressions = #map2}> : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> ()
    %19073 = "torch.constant.int"() <{value = -2 : i64}> : () -> !torch.int
    %19074 = "torch.aten.unsqueeze"(%18982, %19073) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%19074, %18474) <{shape_expressions = #map25}> : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.int) -> ()
    %19075 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %19076 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %19077 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %19078 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %19079 = "torch.prim.ListConstruct"(%19075, %18481, %19076, %19077, %19078) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %19080 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %19081 = "torch.aten.expand"(%19074, %19079, %19080) : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%19081, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %19082 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %19083 = "torch.aten.clone"(%19081, %19082) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%19083, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %19084 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %19085 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %19086 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %19087 = "torch.prim.ListConstruct"(%19084, %18481, %19085, %19086) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %19088 = "torch.aten._unsafe_view"(%19083, %19087) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%19088, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %19089 = "torch.constant.int"() <{value = -2 : i64}> : () -> !torch.int
    %19090 = "torch.aten.unsqueeze"(%18682, %19089) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%19090, %18474) <{shape_expressions = #map25}> : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.int) -> ()
    %19091 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %19092 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %19093 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %19094 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %19095 = "torch.prim.ListConstruct"(%19091, %18481, %19092, %19093, %19094) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %19096 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %19097 = "torch.aten.expand"(%19090, %19095, %19096) : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%19097, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %19098 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %19099 = "torch.aten.clone"(%19097, %19098) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%19099, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %19100 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %19101 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %19102 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %19103 = "torch.prim.ListConstruct"(%19100, %18481, %19101, %19102) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %19104 = "torch.aten._unsafe_view"(%19099, %19103) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%19104, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %19105 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %19106 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %19107 = "torch.aten.transpose.int"(%18832, %19105, %19106) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%19107, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %19108 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %19109 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %19110 = "torch.aten.transpose.int"(%19088, %19108, %19109) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%19110, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %19111 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %19112 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %19113 = "torch.aten.transpose.int"(%19104, %19111, %19112) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%19113, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %19114 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %19115 = "torch.aten.squeeze.dim"(%18570, %19114) : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,1,?,?],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%19115, %18474) <{shape_expressions = #map8}> : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> ()
    %19116 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %19117 = "torch.aten.squeeze.dim"(%19115, %19116) : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,1,?,?],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%19117, %18474) <{shape_expressions = #map8}> : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> ()
    %19118 = "torch_c.to_builtin_tensor"(%19107) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %19119 = "torch_c.to_builtin_tensor"(%19110) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %19120 = "torch_c.to_builtin_tensor"(%19113) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %19121 = "torch_c.to_builtin_tensor"(%19117) : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>) -> tensor<4x1x?x?xf8E4M3FNUZ>
    %19122 = "tensor.cast"(%19121) : (tensor<4x1x?x?xf8E4M3FNUZ>) -> tensor<?x?x?x?xf8E4M3FNUZ>
    %19123 = "torch_c.to_builtin_tensor"(%17209) : (!torch.vtensor<[],f32>) -> tensor<f32>
    %19124 = "util.call"(%19118, %19119, %19120, %19123, %19122) <{callee = @sharktank_masked_flash_attention_4_32_128_128_f8E4M3FNUZ_f32_f32}> : (tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<f32>, tensor<?x?x?x?xf8E4M3FNUZ>) -> tensor<4x32x?x128xf32>
    %19125 = "torch_c.from_builtin_tensor"(%19124) : (tensor<4x32x?x128xf32>) -> !torch.vtensor<[4,32,?,128],f32>
    "torch.bind_symbolic_shape"(%19125, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f32>, !torch.int) -> ()
    %19126 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %19127 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %19128 = "torch.aten.transpose.int"(%19125, %19126, %19127) : (!torch.vtensor<[4,32,?,128],f32>, !torch.int, !torch.int) -> !torch.vtensor<[4,?,32,128],f32>
    "torch.bind_symbolic_shape"(%19128, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f32>, !torch.int) -> ()
    %19129 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %19130 = "torch.aten.clone"(%19128, %19129) : (!torch.vtensor<[4,?,32,128],f32>, !torch.int) -> !torch.vtensor<[4,?,32,128],f32>
    "torch.bind_symbolic_shape"(%19130, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f32>, !torch.int) -> ()
    %19131 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %19132 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %19133 = "torch.prim.ListConstruct"(%19131, %18481, %19132) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %19134 = "torch.aten._unsafe_view"(%19130, %19133) : (!torch.vtensor<[4,?,32,128],f32>, !torch.list<int>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%19134, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %19135 = "torch.aten.div.Tensor"(%19134, %17211) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%19135, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %19136 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %19137 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %19138 = "torch.aten.clamp"(%19135, %19136, %19137) : (!torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%19138, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %19139 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %19140 = "torch.prims.convert_element_type"(%19138, %19139) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%19140, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %19141 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %19142 = "torch.aten.unsqueeze"(%17213, %19141) : (!torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>
    %19143 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %19144 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %19145 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %19146 = "torch.prim.ListConstruct"(%19143, %19144, %19145) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %19147 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %19148 = "torch.aten.expand"(%19142, %19146, %19147) : (!torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ>
    %19149 = "torch_c.to_builtin_tensor"(%19140) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %19150 = "torch_c.to_builtin_tensor"(%19148) : (!torch.vtensor<[4,4096,4096],f8E4M3FNUZ>) -> tensor<4x4096x4096xf8E4M3FNUZ>
    %19151 = "util.call"(%19149, %19150) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x?x4096xf32>
    %19152 = "torch_c.from_builtin_tensor"(%19151) : (tensor<4x?x4096xf32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%19152, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %19153 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %19154 = "torch.prims.convert_element_type"(%19152, %19153) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%19154, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %19155 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %19156 = "torch.aten.add.Tensor"(%18576, %19154, %19155) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%19156, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %19157 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %19158 = "torch.prims.convert_element_type"(%19156, %19157) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%19158, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %19159 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %19160 = "torch.aten.pow.Tensor_Scalar"(%19158, %19159) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%19160, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %19161 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %19162 = "torch.prim.ListConstruct"(%19161) : (!torch.int) -> !torch.list<int>
    %19163 = "torch.constant.bool"() <{value = true}> : () -> !torch.bool
    %19164 = "torch.constant.none"() : () -> !torch.none
    %19165 = "torch.aten.mean.dim"(%19160, %19162, %19163, %19164) : (!torch.vtensor<[4,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%19165, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %19166 = "torch.constant.float"() <{value = 1.000000e-05 : f64}> : () -> !torch.float
    %19167 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %19168 = "torch.aten.add.Scalar"(%19165, %19166, %19167) : (!torch.vtensor<[4,?,1],f32>, !torch.float, !torch.int) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%19168, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %19169 = "torch.aten.rsqrt"(%19168) : (!torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%19169, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %19170 = "torch.aten.mul.Tensor"(%19158, %19169) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%19170, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %19171 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %19172 = "torch.prims.convert_element_type"(%19170, %19171) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%19172, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %19173 = "torch.aten.mul.Tensor"(%17215, %19172) : (!torch.vtensor<[4096],bf16>, !torch.vtensor<[4,?,4096],bf16>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%19173, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %19174 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %19175 = "torch.prims.convert_element_type"(%19173, %19174) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%19175, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %19176 = "torch.aten.div.Tensor"(%19175, %17217) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%19176, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %19177 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %19178 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %19179 = "torch.aten.clamp"(%19176, %19177, %19178) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%19179, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %19180 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %19181 = "torch.prims.convert_element_type"(%19179, %19180) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%19181, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %19182 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %19183 = "torch.aten.unsqueeze"(%17219, %19182) : (!torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>
    %19184 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %19185 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %19186 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %19187 = "torch.prim.ListConstruct"(%19184, %19185, %19186) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %19188 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %19189 = "torch.aten.expand"(%19183, %19187, %19188) : (!torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ>
    %19190 = "torch_c.to_builtin_tensor"(%19181) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %19191 = "torch_c.to_builtin_tensor"(%19189) : (!torch.vtensor<[4,14336,4096],f8E4M3FNUZ>) -> tensor<4x14336x4096xf8E4M3FNUZ>
    %19192 = "util.call"(%19190, %19191) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x?x14336xf32>
    %19193 = "torch_c.from_builtin_tensor"(%19192) : (tensor<4x?x14336xf32>) -> !torch.vtensor<[4,?,14336],f32>
    "torch.bind_symbolic_shape"(%19193, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> ()
    %19194 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %19195 = "torch.prims.convert_element_type"(%19193, %19194) : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%19195, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %19196 = "torch.aten.silu"(%19195) : (!torch.vtensor<[4,?,14336],bf16>) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%19196, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %19197 = "torch.aten.div.Tensor"(%19175, %17221) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%19197, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %19198 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %19199 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %19200 = "torch.aten.clamp"(%19197, %19198, %19199) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%19200, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %19201 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %19202 = "torch.prims.convert_element_type"(%19200, %19201) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%19202, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %19203 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %19204 = "torch.aten.unsqueeze"(%17223, %19203) : (!torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>
    %19205 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %19206 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %19207 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %19208 = "torch.prim.ListConstruct"(%19205, %19206, %19207) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %19209 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %19210 = "torch.aten.expand"(%19204, %19208, %19209) : (!torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ>
    %19211 = "torch_c.to_builtin_tensor"(%19202) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %19212 = "torch_c.to_builtin_tensor"(%19210) : (!torch.vtensor<[4,14336,4096],f8E4M3FNUZ>) -> tensor<4x14336x4096xf8E4M3FNUZ>
    %19213 = "util.call"(%19211, %19212) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x?x14336xf32>
    %19214 = "torch_c.from_builtin_tensor"(%19213) : (tensor<4x?x14336xf32>) -> !torch.vtensor<[4,?,14336],f32>
    "torch.bind_symbolic_shape"(%19214, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> ()
    %19215 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %19216 = "torch.prims.convert_element_type"(%19214, %19215) : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%19216, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %19217 = "torch.aten.mul.Tensor"(%19196, %19216) : (!torch.vtensor<[4,?,14336],bf16>, !torch.vtensor<[4,?,14336],bf16>) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%19217, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %19218 = "torch.aten.div.Tensor"(%19217, %17225) : (!torch.vtensor<[4,?,14336],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%19218, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %19219 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %19220 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %19221 = "torch.aten.clamp"(%19218, %19219, %19220) : (!torch.vtensor<[4,?,14336],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%19221, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %19222 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %19223 = "torch.prims.convert_element_type"(%19221, %19222) : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> !torch.vtensor<[4,?,14336],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%19223, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],f8E4M3FNUZ>, !torch.int) -> ()
    %19224 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %19225 = "torch.aten.unsqueeze"(%17227, %19224) : (!torch.vtensor<[4096,14336],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,14336],f8E4M3FNUZ>
    %19226 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %19227 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %19228 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %19229 = "torch.prim.ListConstruct"(%19226, %19227, %19228) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %19230 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %19231 = "torch.aten.expand"(%19225, %19229, %19230) : (!torch.vtensor<[1,4096,14336],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,14336],f8E4M3FNUZ>
    %19232 = "torch_c.to_builtin_tensor"(%19223) : (!torch.vtensor<[4,?,14336],f8E4M3FNUZ>) -> tensor<4x?x14336xf8E4M3FNUZ>
    %19233 = "torch_c.to_builtin_tensor"(%19231) : (!torch.vtensor<[4,4096,14336],f8E4M3FNUZ>) -> tensor<4x4096x14336xf8E4M3FNUZ>
    %19234 = "util.call"(%19232, %19233) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx14336xf8E4M3FNUZ_R4x4096x14336xf8E4M3FNUZ}> : (tensor<4x?x14336xf8E4M3FNUZ>, tensor<4x4096x14336xf8E4M3FNUZ>) -> tensor<4x?x4096xf32>
    %19235 = "torch_c.from_builtin_tensor"(%19234) : (tensor<4x?x4096xf32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%19235, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %19236 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %19237 = "torch.prims.convert_element_type"(%19235, %19236) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%19237, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %19238 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %19239 = "torch.aten.add.Tensor"(%19156, %19237, %19238) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%19239, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %19240 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %19241 = "torch.prims.convert_element_type"(%19239, %19240) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%19241, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %19242 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %19243 = "torch.aten.pow.Tensor_Scalar"(%19241, %19242) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%19243, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %19244 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %19245 = "torch.prim.ListConstruct"(%19244) : (!torch.int) -> !torch.list<int>
    %19246 = "torch.constant.bool"() <{value = true}> : () -> !torch.bool
    %19247 = "torch.constant.none"() : () -> !torch.none
    %19248 = "torch.aten.mean.dim"(%19243, %19245, %19246, %19247) : (!torch.vtensor<[4,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%19248, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %19249 = "torch.constant.float"() <{value = 1.000000e-05 : f64}> : () -> !torch.float
    %19250 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %19251 = "torch.aten.add.Scalar"(%19248, %19249, %19250) : (!torch.vtensor<[4,?,1],f32>, !torch.float, !torch.int) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%19251, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %19252 = "torch.aten.rsqrt"(%19251) : (!torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%19252, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %19253 = "torch.aten.mul.Tensor"(%19241, %19252) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%19253, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %19254 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %19255 = "torch.prims.convert_element_type"(%19253, %19254) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%19255, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %19256 = "torch.aten.mul.Tensor"(%17229, %19255) : (!torch.vtensor<[4096],bf16>, !torch.vtensor<[4,?,4096],bf16>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%19256, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %19257 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %19258 = "torch.prims.convert_element_type"(%19256, %19257) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%19258, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %19259 = "torch.aten.div.Tensor"(%19258, %17231) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%19259, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %19260 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %19261 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %19262 = "torch.aten.clamp"(%19259, %19260, %19261) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%19262, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %19263 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %19264 = "torch.prims.convert_element_type"(%19262, %19263) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%19264, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %19265 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %19266 = "torch.aten.unsqueeze"(%17233, %19265) : (!torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>
    %19267 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %19268 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %19269 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %19270 = "torch.prim.ListConstruct"(%19267, %19268, %19269) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %19271 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %19272 = "torch.aten.expand"(%19266, %19270, %19271) : (!torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ>
    %19273 = "torch_c.to_builtin_tensor"(%19264) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %19274 = "torch_c.to_builtin_tensor"(%19272) : (!torch.vtensor<[4,4096,4096],f8E4M3FNUZ>) -> tensor<4x4096x4096xf8E4M3FNUZ>
    %19275 = "util.call"(%19273, %19274) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x?x4096xf32>
    %19276 = "torch_c.from_builtin_tensor"(%19275) : (tensor<4x?x4096xf32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%19276, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %19277 = "torch.aten.div.Tensor"(%19276, %17235) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%19277, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %19278 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %19279 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %19280 = "torch.aten.clamp"(%19277, %19278, %19279) : (!torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%19280, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %19281 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %19282 = "torch.prims.convert_element_type"(%19280, %19281) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%19282, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %19283 = "torch.aten.div.Tensor"(%19258, %17237) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%19283, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %19284 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %19285 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %19286 = "torch.aten.clamp"(%19283, %19284, %19285) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%19286, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %19287 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %19288 = "torch.prims.convert_element_type"(%19286, %19287) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%19288, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %19289 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %19290 = "torch.aten.unsqueeze"(%17239, %19289) : (!torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>
    %19291 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %19292 = "torch.constant.int"() <{value = 1024 : i64}> : () -> !torch.int
    %19293 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %19294 = "torch.prim.ListConstruct"(%19291, %19292, %19293) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %19295 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %19296 = "torch.aten.expand"(%19290, %19294, %19295) : (!torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ>
    %19297 = "torch_c.to_builtin_tensor"(%19288) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %19298 = "torch_c.to_builtin_tensor"(%19296) : (!torch.vtensor<[4,1024,4096],f8E4M3FNUZ>) -> tensor<4x1024x4096xf8E4M3FNUZ>
    %19299 = "util.call"(%19297, %19298) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x?x1024xf32>
    %19300 = "torch_c.from_builtin_tensor"(%19299) : (tensor<4x?x1024xf32>) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%19300, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %19301 = "torch.aten.div.Tensor"(%19300, %17241) : (!torch.vtensor<[4,?,1024],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%19301, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %19302 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %19303 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %19304 = "torch.aten.clamp"(%19301, %19302, %19303) : (!torch.vtensor<[4,?,1024],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%19304, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %19305 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %19306 = "torch.prims.convert_element_type"(%19304, %19305) : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> !torch.vtensor<[4,?,1024],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%19306, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.int) -> ()
    %19307 = "torch.aten.div.Tensor"(%19258, %17243) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%19307, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %19308 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %19309 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %19310 = "torch.aten.clamp"(%19307, %19308, %19309) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%19310, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %19311 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %19312 = "torch.prims.convert_element_type"(%19310, %19311) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%19312, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %19313 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %19314 = "torch.aten.unsqueeze"(%17245, %19313) : (!torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>
    %19315 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %19316 = "torch.constant.int"() <{value = 1024 : i64}> : () -> !torch.int
    %19317 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %19318 = "torch.prim.ListConstruct"(%19315, %19316, %19317) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %19319 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %19320 = "torch.aten.expand"(%19314, %19318, %19319) : (!torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ>
    %19321 = "torch_c.to_builtin_tensor"(%19312) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %19322 = "torch_c.to_builtin_tensor"(%19320) : (!torch.vtensor<[4,1024,4096],f8E4M3FNUZ>) -> tensor<4x1024x4096xf8E4M3FNUZ>
    %19323 = "util.call"(%19321, %19322) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x?x1024xf32>
    %19324 = "torch_c.from_builtin_tensor"(%19323) : (tensor<4x?x1024xf32>) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%19324, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %19325 = "torch.aten.div.Tensor"(%19324, %17247) : (!torch.vtensor<[4,?,1024],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%19325, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %19326 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %19327 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %19328 = "torch.aten.clamp"(%19325, %19326, %19327) : (!torch.vtensor<[4,?,1024],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%19328, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %19329 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %19330 = "torch.prims.convert_element_type"(%19328, %19329) : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> !torch.vtensor<[4,?,1024],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%19330, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.int) -> ()
    %19331 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %19332 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %19333 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %19334 = "torch.prim.ListConstruct"(%19331, %18481, %19332, %19333) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %19335 = "torch.aten.view"(%19282, %19334) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%19335, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %19336 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %19337 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %19338 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %19339 = "torch.prim.ListConstruct"(%19336, %18481, %19337, %19338) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %19340 = "torch.aten.view"(%19306, %19339) : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%19340, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %19341 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %19342 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %19343 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %19344 = "torch.prim.ListConstruct"(%19341, %18481, %19342, %19343) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %19345 = "torch.aten.view"(%19330, %19344) : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%19345, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %19346 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
    %19347 = "torch.constant.none"() : () -> !torch.none
    %19348 = "torch.constant.none"() : () -> !torch.none
    %19349 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %19350 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %19351 = "torch.aten.arange"(%19346, %19347, %19348, %19349, %19350) : (!torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[131072],si64>
    %19352 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %19353 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %19354 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %19355 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %19356 = "torch.constant.none"() : () -> !torch.none
    %19357 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %19358 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %19359 = "torch.aten.arange.start_step"(%19352, %19353, %19354, %19355, %19356, %19357, %19358) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[64],si64>
    %19360 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %19361 = "torch.prims.convert_element_type"(%19359, %19360) : (!torch.vtensor<[64],si64>, !torch.int) -> !torch.vtensor<[64],f32>
    %19362 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %19363 = "torch.aten.div.Scalar"(%19361, %19362) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %19364 = "torch.constant.float"() <{value = 5.000000e+05 : f64}> : () -> !torch.float
    %19365 = "torch.aten.pow.Scalar"(%19364, %19363) : (!torch.float, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %19366 = "torch.aten.reciprocal"(%19365) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %19367 = "torch.constant.float"() <{value = 1.000000e+00 : f64}> : () -> !torch.float
    %19368 = "torch.aten.mul.Scalar"(%19366, %19367) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
    %19369 = "torch.aten.reciprocal"(%19368) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %19370 = "torch.constant.float"() <{value = 6.2831853071795862 : f64}> : () -> !torch.float
    %19371 = "torch.aten.mul.Scalar"(%19369, %19370) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
    %19372 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
    %19373 = "torch.aten.gt.Scalar"(%19371, %19372) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %19374 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %19375 = "torch.aten.div.Scalar"(%19368, %19374) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %19376 = "torch.aten.where.self"(%19373, %19375, %19368) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %19377 = "torch.aten.reciprocal"(%19371) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %19378 = "torch.constant.int"() <{value = 8192 : i64}> : () -> !torch.int
    %19379 = "torch.aten.mul.Scalar"(%19377, %19378) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %19380 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %19381 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %19382 = "torch.aten.sub.Scalar"(%19379, %19380, %19381) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
    %19383 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %19384 = "torch.aten.div.Scalar"(%19382, %19383) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %19385 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %19386 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %19387 = "torch.aten.rsub.Scalar"(%19384, %19385, %19386) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
    %19388 = "torch.aten.mul.Tensor"(%19387, %19376) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %19389 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %19390 = "torch.aten.div.Scalar"(%19388, %19389) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %19391 = "torch.aten.mul.Tensor"(%19384, %19376) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %19392 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %19393 = "torch.aten.add.Tensor"(%19390, %19391, %19392) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %19394 = "torch.constant.float"() <{value = 2.048000e+03 : f64}> : () -> !torch.float
    %19395 = "torch.aten.lt.Scalar"(%19371, %19394) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %19396 = "torch.aten.bitwise_not"(%19395) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %19397 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
    %19398 = "torch.aten.gt.Scalar"(%19371, %19397) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %19399 = "torch.aten.bitwise_not"(%19398) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %19400 = "torch.aten.mul.Tensor"(%19396, %19399) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %19401 = "torch.aten.where.self"(%19400, %19393, %19376) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %19402 = "torch.prim.ListConstruct"(%19401, %19401) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.list<vtensor>
    %19403 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %19404 = "torch.aten.cat"(%19402, %19403) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[128],f32>
    %19405 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %19406 = "torch.prims.convert_element_type"(%19351, %19405) : (!torch.vtensor<[131072],si64>, !torch.int) -> !torch.vtensor<[131072],f32>
    %19407 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %19408 = "torch.prims.convert_element_type"(%19404, %19407) : (!torch.vtensor<[128],f32>, !torch.int) -> !torch.vtensor<[128],f32>
    %19409 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
    %19410 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %19411 = "torch.prim.ListConstruct"(%19409, %19410) : (!torch.int, !torch.int) -> !torch.list<int>
    %19412 = "torch.aten.view"(%19406, %19411) : (!torch.vtensor<[131072],f32>, !torch.list<int>) -> !torch.vtensor<[131072,1],f32>
    %19413 = "torch.aten.mul.Tensor"(%19412, %19408) : (!torch.vtensor<[131072,1],f32>, !torch.vtensor<[128],f32>) -> !torch.vtensor<[131072,128],f32>
    %19414 = "torch.aten.cos"(%19413) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
    %19415 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %19416 = "torch.prims.convert_element_type"(%19414, %19415) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
    %19417 = "torch.aten.sin"(%19413) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
    %19418 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %19419 = "torch.prims.convert_element_type"(%19417, %19418) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
    %19420 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %19421 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %19422 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %19423 = "torch.aten.slice.Tensor"(%19416, %19420, %19421, %18481, %19422) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%19423, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %19424 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %19425 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %19426 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %19427 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %19428 = "torch.aten.slice.Tensor"(%19423, %19424, %19425, %19426, %19427) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%19428, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %19429 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %19430 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %19431 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %19432 = "torch.aten.slice.Tensor"(%19419, %19429, %19430, %18481, %19431) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%19432, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %19433 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %19434 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %19435 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %19436 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %19437 = "torch.aten.slice.Tensor"(%19432, %19433, %19434, %19435, %19436) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%19437, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %19438 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %19439 = "torch.aten.unsqueeze"(%19428, %19438) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%19439, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %19440 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %19441 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %19442 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %19443 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %19444 = "torch.aten.slice.Tensor"(%19439, %19440, %19441, %19442, %19443) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%19444, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %19445 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %19446 = "torch.aten.unsqueeze"(%19444, %19445) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%19446, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %19447 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %19448 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %19449 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %19450 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %19451 = "torch.aten.slice.Tensor"(%19446, %19447, %19448, %19449, %19450) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%19451, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %19452 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %19453 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %19454 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %19455 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %19456 = "torch.prim.ListConstruct"(%19452, %19453, %19454, %19455) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %19457 = "torch.aten.repeat"(%19451, %19456) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%19457, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
    %19458 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %19459 = "torch.aten.unsqueeze"(%19437, %19458) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%19459, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %19460 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %19461 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %19462 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %19463 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %19464 = "torch.aten.slice.Tensor"(%19459, %19460, %19461, %19462, %19463) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%19464, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %19465 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %19466 = "torch.aten.unsqueeze"(%19464, %19465) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%19466, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %19467 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %19468 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %19469 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %19470 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %19471 = "torch.aten.slice.Tensor"(%19466, %19467, %19468, %19469, %19470) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%19471, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %19472 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %19473 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %19474 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %19475 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %19476 = "torch.prim.ListConstruct"(%19472, %19473, %19474, %19475) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %19477 = "torch.aten.repeat"(%19471, %19476) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%19477, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
    %19478 = "torch.aten.mul.Tensor"(%19335, %19457) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%19478, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %19479 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %19480 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %19481 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %19482 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %19483 = "torch.aten.slice.Tensor"(%19335, %19479, %19480, %19481, %19482) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%19483, %18474) <{shape_expressions = #map18}> : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.int) -> ()
    %19484 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %19485 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %19486 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %19487 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %19488 = "torch.aten.slice.Tensor"(%19335, %19484, %19485, %19486, %19487) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%19488, %18474) <{shape_expressions = #map18}> : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.int) -> ()
    %19489 = "torch.aten.neg"(%19488) : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>) -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%19489, %18474) <{shape_expressions = #map18}> : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.int) -> ()
    %19490 = "torch.prim.ListConstruct"(%19489, %19483) : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>) -> !torch.list<vtensor>
    %19491 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %19492 = "torch.aten.cat"(%19490, %19491) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%19492, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %19493 = "torch.aten.mul.Tensor"(%19492, %19477) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%19493, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %19494 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %19495 = "torch.aten.add.Tensor"(%19478, %19493, %19494) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%19495, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %19496 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
    %19497 = "torch.constant.none"() : () -> !torch.none
    %19498 = "torch.constant.none"() : () -> !torch.none
    %19499 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %19500 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %19501 = "torch.aten.arange"(%19496, %19497, %19498, %19499, %19500) : (!torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[131072],si64>
    %19502 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %19503 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %19504 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %19505 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %19506 = "torch.constant.none"() : () -> !torch.none
    %19507 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %19508 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %19509 = "torch.aten.arange.start_step"(%19502, %19503, %19504, %19505, %19506, %19507, %19508) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[64],si64>
    %19510 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %19511 = "torch.prims.convert_element_type"(%19509, %19510) : (!torch.vtensor<[64],si64>, !torch.int) -> !torch.vtensor<[64],f32>
    %19512 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %19513 = "torch.aten.div.Scalar"(%19511, %19512) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %19514 = "torch.constant.float"() <{value = 5.000000e+05 : f64}> : () -> !torch.float
    %19515 = "torch.aten.pow.Scalar"(%19514, %19513) : (!torch.float, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %19516 = "torch.aten.reciprocal"(%19515) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %19517 = "torch.constant.float"() <{value = 1.000000e+00 : f64}> : () -> !torch.float
    %19518 = "torch.aten.mul.Scalar"(%19516, %19517) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
    %19519 = "torch.aten.reciprocal"(%19518) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %19520 = "torch.constant.float"() <{value = 6.2831853071795862 : f64}> : () -> !torch.float
    %19521 = "torch.aten.mul.Scalar"(%19519, %19520) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
    %19522 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
    %19523 = "torch.aten.gt.Scalar"(%19521, %19522) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %19524 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %19525 = "torch.aten.div.Scalar"(%19518, %19524) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %19526 = "torch.aten.where.self"(%19523, %19525, %19518) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %19527 = "torch.aten.reciprocal"(%19521) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %19528 = "torch.constant.int"() <{value = 8192 : i64}> : () -> !torch.int
    %19529 = "torch.aten.mul.Scalar"(%19527, %19528) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %19530 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %19531 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %19532 = "torch.aten.sub.Scalar"(%19529, %19530, %19531) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
    %19533 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %19534 = "torch.aten.div.Scalar"(%19532, %19533) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %19535 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %19536 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %19537 = "torch.aten.rsub.Scalar"(%19534, %19535, %19536) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
    %19538 = "torch.aten.mul.Tensor"(%19537, %19526) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %19539 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %19540 = "torch.aten.div.Scalar"(%19538, %19539) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %19541 = "torch.aten.mul.Tensor"(%19534, %19526) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %19542 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %19543 = "torch.aten.add.Tensor"(%19540, %19541, %19542) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %19544 = "torch.constant.float"() <{value = 2.048000e+03 : f64}> : () -> !torch.float
    %19545 = "torch.aten.lt.Scalar"(%19521, %19544) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %19546 = "torch.aten.bitwise_not"(%19545) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %19547 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
    %19548 = "torch.aten.gt.Scalar"(%19521, %19547) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %19549 = "torch.aten.bitwise_not"(%19548) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %19550 = "torch.aten.mul.Tensor"(%19546, %19549) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %19551 = "torch.aten.where.self"(%19550, %19543, %19526) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %19552 = "torch.prim.ListConstruct"(%19551, %19551) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.list<vtensor>
    %19553 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %19554 = "torch.aten.cat"(%19552, %19553) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[128],f32>
    %19555 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %19556 = "torch.prims.convert_element_type"(%19501, %19555) : (!torch.vtensor<[131072],si64>, !torch.int) -> !torch.vtensor<[131072],f32>
    %19557 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %19558 = "torch.prims.convert_element_type"(%19554, %19557) : (!torch.vtensor<[128],f32>, !torch.int) -> !torch.vtensor<[128],f32>
    %19559 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
    %19560 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %19561 = "torch.prim.ListConstruct"(%19559, %19560) : (!torch.int, !torch.int) -> !torch.list<int>
    %19562 = "torch.aten.view"(%19556, %19561) : (!torch.vtensor<[131072],f32>, !torch.list<int>) -> !torch.vtensor<[131072,1],f32>
    %19563 = "torch.aten.mul.Tensor"(%19562, %19558) : (!torch.vtensor<[131072,1],f32>, !torch.vtensor<[128],f32>) -> !torch.vtensor<[131072,128],f32>
    %19564 = "torch.aten.cos"(%19563) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
    %19565 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %19566 = "torch.prims.convert_element_type"(%19564, %19565) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
    %19567 = "torch.aten.sin"(%19563) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
    %19568 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %19569 = "torch.prims.convert_element_type"(%19567, %19568) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
    %19570 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %19571 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %19572 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %19573 = "torch.aten.slice.Tensor"(%19566, %19570, %19571, %18481, %19572) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%19573, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %19574 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %19575 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %19576 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %19577 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %19578 = "torch.aten.slice.Tensor"(%19573, %19574, %19575, %19576, %19577) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%19578, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %19579 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %19580 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %19581 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %19582 = "torch.aten.slice.Tensor"(%19569, %19579, %19580, %18481, %19581) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%19582, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %19583 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %19584 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %19585 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %19586 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %19587 = "torch.aten.slice.Tensor"(%19582, %19583, %19584, %19585, %19586) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%19587, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %19588 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %19589 = "torch.aten.unsqueeze"(%19578, %19588) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%19589, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %19590 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %19591 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %19592 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %19593 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %19594 = "torch.aten.slice.Tensor"(%19589, %19590, %19591, %19592, %19593) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%19594, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %19595 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %19596 = "torch.aten.unsqueeze"(%19594, %19595) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%19596, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %19597 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %19598 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %19599 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %19600 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %19601 = "torch.aten.slice.Tensor"(%19596, %19597, %19598, %19599, %19600) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%19601, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %19602 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %19603 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %19604 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %19605 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %19606 = "torch.prim.ListConstruct"(%19602, %19603, %19604, %19605) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %19607 = "torch.aten.repeat"(%19601, %19606) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%19607, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
    %19608 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %19609 = "torch.aten.unsqueeze"(%19587, %19608) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%19609, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %19610 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %19611 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %19612 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %19613 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %19614 = "torch.aten.slice.Tensor"(%19609, %19610, %19611, %19612, %19613) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%19614, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %19615 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %19616 = "torch.aten.unsqueeze"(%19614, %19615) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%19616, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %19617 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %19618 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %19619 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %19620 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %19621 = "torch.aten.slice.Tensor"(%19616, %19617, %19618, %19619, %19620) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%19621, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %19622 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %19623 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %19624 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %19625 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %19626 = "torch.prim.ListConstruct"(%19622, %19623, %19624, %19625) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %19627 = "torch.aten.repeat"(%19621, %19626) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%19627, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
    %19628 = "torch.aten.mul.Tensor"(%19340, %19607) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%19628, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %19629 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %19630 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %19631 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %19632 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %19633 = "torch.aten.slice.Tensor"(%19340, %19629, %19630, %19631, %19632) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%19633, %18474) <{shape_expressions = #map19}> : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.int) -> ()
    %19634 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %19635 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %19636 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %19637 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %19638 = "torch.aten.slice.Tensor"(%19340, %19634, %19635, %19636, %19637) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%19638, %18474) <{shape_expressions = #map19}> : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.int) -> ()
    %19639 = "torch.aten.neg"(%19638) : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>) -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%19639, %18474) <{shape_expressions = #map19}> : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.int) -> ()
    %19640 = "torch.prim.ListConstruct"(%19639, %19633) : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>) -> !torch.list<vtensor>
    %19641 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %19642 = "torch.aten.cat"(%19640, %19641) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%19642, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %19643 = "torch.aten.mul.Tensor"(%19642, %19627) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%19643, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %19644 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %19645 = "torch.aten.add.Tensor"(%19628, %19643, %19644) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%19645, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %19646 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %19647 = "torch.aten.mul.Scalar"(%arg69, %19646) : (!torch.vtensor<[4,?],si64>, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%19647, %18474) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %19648 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %19649 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %19650 = "torch.aten.add.Scalar"(%19647, %19648, %19649) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%19650, %18474) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %19651 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %19652 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %19653 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %19654 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %19655 = "torch.prim.ListConstruct"(%19651, %18477, %19652, %19653, %19654) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %19656 = "torch.aten.view"(%19645, %19655) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%19656, %18474) <{shape_expressions = #map22}> : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %19657 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %19658 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %19659 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %19660 = "torch.prim.ListConstruct"(%19011, %19657, %19658, %19659) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %19661 = "torch.aten.view"(%19656, %19660) : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%19661, %18474) <{shape_expressions = #map23}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %19662 = "torch.prim.ListConstruct"(%19011) : (!torch.int) -> !torch.list<int>
    %19663 = "torch.aten.view"(%19650, %19662) : (!torch.vtensor<[4,?],si64>, !torch.list<int>) -> !torch.vtensor<[?],si64>
    "torch.bind_symbolic_shape"(%19663, %18474) <{shape_expressions = #map24}> : (!torch.vtensor<[?],si64>, !torch.int) -> ()
    %19664 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %19665 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %19666 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %19667 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %19668 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %19669 = "torch.prim.ListConstruct"(%18479, %19664, %19665, %19666, %19667, %19668) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %19670 = "torch.aten.view"(%19072, %19669) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%19670, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %19671 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %19672 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %19673 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %19674 = "torch.prim.ListConstruct"(%18993, %19671, %19672, %19673) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %19675 = "torch.aten.view"(%19670, %19674) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%19675, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %19676 = "torch.prim.ListConstruct"(%19663) : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>>
    %19677 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %19678 = "torch.aten.index_put"(%19675, %19676, %19661, %19677) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.bool) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%19678, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %19679 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %19680 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %19681 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %19682 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %19683 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %19684 = "torch.prim.ListConstruct"(%18479, %19679, %19680, %19681, %19682, %19683) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %19685 = "torch.aten.view"(%19678, %19684) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%19685, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %19686 = "torch.constant.int"() <{value = 2097152 : i64}> : () -> !torch.int
    %19687 = "torch.prim.ListConstruct"(%18479, %19686) : (!torch.int, !torch.int) -> !torch.list<int>
    %19688 = "torch.aten.view"(%19685, %19687) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,2097152],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%19688, %18475) <{shape_expressions = #map2}> : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> ()
    %19689 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %19690 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %19691 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %19692 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %19693 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %19694 = "torch.prim.ListConstruct"(%18479, %19689, %19690, %19691, %19692, %19693) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %19695 = "torch.aten.view"(%19688, %19694) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%19695, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %19696 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %19697 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %19698 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %19699 = "torch.prim.ListConstruct"(%18993, %19696, %19697, %19698) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %19700 = "torch.aten.view"(%19695, %19699) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%19700, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %19701 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %19702 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %19703 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %19704 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %19705 = "torch.prim.ListConstruct"(%19701, %18477, %19702, %19703, %19704) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %19706 = "torch.aten.view"(%19345, %19705) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%19706, %18474) <{shape_expressions = #map22}> : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %19707 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %19708 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %19709 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %19710 = "torch.prim.ListConstruct"(%19011, %19707, %19708, %19709) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %19711 = "torch.aten.view"(%19706, %19710) : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%19711, %18474) <{shape_expressions = #map23}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %19712 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %19713 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %19714 = "torch.aten.add.Scalar"(%19650, %19712, %19713) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%19714, %18474) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %19715 = "torch.prim.ListConstruct"(%19011) : (!torch.int) -> !torch.list<int>
    %19716 = "torch.aten.view"(%19714, %19715) : (!torch.vtensor<[4,?],si64>, !torch.list<int>) -> !torch.vtensor<[?],si64>
    "torch.bind_symbolic_shape"(%19716, %18474) <{shape_expressions = #map24}> : (!torch.vtensor<[?],si64>, !torch.int) -> ()
    %19717 = "torch.prim.ListConstruct"(%19716) : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>>
    %19718 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %19719 = "torch.aten.index_put"(%19700, %19717, %19711, %19718) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.bool) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%19719, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %19720 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %19721 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %19722 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %19723 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %19724 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %19725 = "torch.prim.ListConstruct"(%18479, %19720, %19721, %19722, %19723, %19724) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %19726 = "torch.aten.view"(%19719, %19725) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%19726, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %19727 = "torch.constant.int"() <{value = 2097152 : i64}> : () -> !torch.int
    %19728 = "torch.prim.ListConstruct"(%18479, %19727) : (!torch.int, !torch.int) -> !torch.list<int>
    %19729 = "torch.aten.view"(%19726, %19728) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,2097152],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%19729, %18475) <{shape_expressions = #map2}> : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> ()
    %19730 = "torch.constant.int"() <{value = -2 : i64}> : () -> !torch.int
    %19731 = "torch.aten.unsqueeze"(%19645, %19730) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%19731, %18474) <{shape_expressions = #map25}> : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.int) -> ()
    %19732 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %19733 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %19734 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %19735 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %19736 = "torch.prim.ListConstruct"(%19732, %18481, %19733, %19734, %19735) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %19737 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %19738 = "torch.aten.expand"(%19731, %19736, %19737) : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%19738, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %19739 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %19740 = "torch.aten.clone"(%19738, %19739) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%19740, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %19741 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %19742 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %19743 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %19744 = "torch.prim.ListConstruct"(%19741, %18481, %19742, %19743) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %19745 = "torch.aten._unsafe_view"(%19740, %19744) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%19745, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %19746 = "torch.constant.int"() <{value = -2 : i64}> : () -> !torch.int
    %19747 = "torch.aten.unsqueeze"(%19345, %19746) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%19747, %18474) <{shape_expressions = #map25}> : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.int) -> ()
    %19748 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %19749 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %19750 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %19751 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %19752 = "torch.prim.ListConstruct"(%19748, %18481, %19749, %19750, %19751) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %19753 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %19754 = "torch.aten.expand"(%19747, %19752, %19753) : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%19754, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %19755 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %19756 = "torch.aten.clone"(%19754, %19755) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%19756, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %19757 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %19758 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %19759 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %19760 = "torch.prim.ListConstruct"(%19757, %18481, %19758, %19759) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %19761 = "torch.aten._unsafe_view"(%19756, %19760) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%19761, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %19762 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %19763 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %19764 = "torch.aten.transpose.int"(%19495, %19762, %19763) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%19764, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %19765 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %19766 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %19767 = "torch.aten.transpose.int"(%19745, %19765, %19766) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%19767, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %19768 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %19769 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %19770 = "torch.aten.transpose.int"(%19761, %19768, %19769) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%19770, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %19771 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %19772 = "torch.aten.squeeze.dim"(%18570, %19771) : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,1,?,?],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%19772, %18474) <{shape_expressions = #map8}> : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> ()
    %19773 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %19774 = "torch.aten.squeeze.dim"(%19772, %19773) : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,1,?,?],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%19774, %18474) <{shape_expressions = #map8}> : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> ()
    %19775 = "torch_c.to_builtin_tensor"(%19764) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %19776 = "torch_c.to_builtin_tensor"(%19767) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %19777 = "torch_c.to_builtin_tensor"(%19770) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %19778 = "torch_c.to_builtin_tensor"(%19774) : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>) -> tensor<4x1x?x?xf8E4M3FNUZ>
    %19779 = "tensor.cast"(%19778) : (tensor<4x1x?x?xf8E4M3FNUZ>) -> tensor<?x?x?x?xf8E4M3FNUZ>
    %19780 = "torch_c.to_builtin_tensor"(%17249) : (!torch.vtensor<[],f32>) -> tensor<f32>
    %19781 = "util.call"(%19775, %19776, %19777, %19780, %19779) <{callee = @sharktank_masked_flash_attention_4_32_128_128_f8E4M3FNUZ_f32_f32}> : (tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<f32>, tensor<?x?x?x?xf8E4M3FNUZ>) -> tensor<4x32x?x128xf32>
    %19782 = "torch_c.from_builtin_tensor"(%19781) : (tensor<4x32x?x128xf32>) -> !torch.vtensor<[4,32,?,128],f32>
    "torch.bind_symbolic_shape"(%19782, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f32>, !torch.int) -> ()
    %19783 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %19784 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %19785 = "torch.aten.transpose.int"(%19782, %19783, %19784) : (!torch.vtensor<[4,32,?,128],f32>, !torch.int, !torch.int) -> !torch.vtensor<[4,?,32,128],f32>
    "torch.bind_symbolic_shape"(%19785, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f32>, !torch.int) -> ()
    %19786 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %19787 = "torch.aten.clone"(%19785, %19786) : (!torch.vtensor<[4,?,32,128],f32>, !torch.int) -> !torch.vtensor<[4,?,32,128],f32>
    "torch.bind_symbolic_shape"(%19787, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f32>, !torch.int) -> ()
    %19788 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %19789 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %19790 = "torch.prim.ListConstruct"(%19788, %18481, %19789) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %19791 = "torch.aten._unsafe_view"(%19787, %19790) : (!torch.vtensor<[4,?,32,128],f32>, !torch.list<int>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%19791, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %19792 = "torch.aten.div.Tensor"(%19791, %17251) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%19792, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %19793 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %19794 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %19795 = "torch.aten.clamp"(%19792, %19793, %19794) : (!torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%19795, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %19796 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %19797 = "torch.prims.convert_element_type"(%19795, %19796) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%19797, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %19798 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %19799 = "torch.aten.unsqueeze"(%17253, %19798) : (!torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>
    %19800 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %19801 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %19802 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %19803 = "torch.prim.ListConstruct"(%19800, %19801, %19802) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %19804 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %19805 = "torch.aten.expand"(%19799, %19803, %19804) : (!torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ>
    %19806 = "torch_c.to_builtin_tensor"(%19797) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %19807 = "torch_c.to_builtin_tensor"(%19805) : (!torch.vtensor<[4,4096,4096],f8E4M3FNUZ>) -> tensor<4x4096x4096xf8E4M3FNUZ>
    %19808 = "util.call"(%19806, %19807) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x?x4096xf32>
    %19809 = "torch_c.from_builtin_tensor"(%19808) : (tensor<4x?x4096xf32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%19809, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %19810 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %19811 = "torch.prims.convert_element_type"(%19809, %19810) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%19811, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %19812 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %19813 = "torch.aten.add.Tensor"(%19239, %19811, %19812) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%19813, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %19814 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %19815 = "torch.prims.convert_element_type"(%19813, %19814) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%19815, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %19816 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %19817 = "torch.aten.pow.Tensor_Scalar"(%19815, %19816) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%19817, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %19818 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %19819 = "torch.prim.ListConstruct"(%19818) : (!torch.int) -> !torch.list<int>
    %19820 = "torch.constant.bool"() <{value = true}> : () -> !torch.bool
    %19821 = "torch.constant.none"() : () -> !torch.none
    %19822 = "torch.aten.mean.dim"(%19817, %19819, %19820, %19821) : (!torch.vtensor<[4,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%19822, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %19823 = "torch.constant.float"() <{value = 1.000000e-05 : f64}> : () -> !torch.float
    %19824 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %19825 = "torch.aten.add.Scalar"(%19822, %19823, %19824) : (!torch.vtensor<[4,?,1],f32>, !torch.float, !torch.int) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%19825, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %19826 = "torch.aten.rsqrt"(%19825) : (!torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%19826, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %19827 = "torch.aten.mul.Tensor"(%19815, %19826) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%19827, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %19828 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %19829 = "torch.prims.convert_element_type"(%19827, %19828) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%19829, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %19830 = "torch.aten.mul.Tensor"(%17255, %19829) : (!torch.vtensor<[4096],bf16>, !torch.vtensor<[4,?,4096],bf16>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%19830, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %19831 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %19832 = "torch.prims.convert_element_type"(%19830, %19831) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%19832, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %19833 = "torch.aten.div.Tensor"(%19832, %17257) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%19833, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %19834 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %19835 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %19836 = "torch.aten.clamp"(%19833, %19834, %19835) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%19836, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %19837 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %19838 = "torch.prims.convert_element_type"(%19836, %19837) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%19838, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %19839 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %19840 = "torch.aten.unsqueeze"(%17259, %19839) : (!torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>
    %19841 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %19842 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %19843 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %19844 = "torch.prim.ListConstruct"(%19841, %19842, %19843) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %19845 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %19846 = "torch.aten.expand"(%19840, %19844, %19845) : (!torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ>
    %19847 = "torch_c.to_builtin_tensor"(%19838) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %19848 = "torch_c.to_builtin_tensor"(%19846) : (!torch.vtensor<[4,14336,4096],f8E4M3FNUZ>) -> tensor<4x14336x4096xf8E4M3FNUZ>
    %19849 = "util.call"(%19847, %19848) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x?x14336xf32>
    %19850 = "torch_c.from_builtin_tensor"(%19849) : (tensor<4x?x14336xf32>) -> !torch.vtensor<[4,?,14336],f32>
    "torch.bind_symbolic_shape"(%19850, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> ()
    %19851 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %19852 = "torch.prims.convert_element_type"(%19850, %19851) : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%19852, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %19853 = "torch.aten.silu"(%19852) : (!torch.vtensor<[4,?,14336],bf16>) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%19853, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %19854 = "torch.aten.div.Tensor"(%19832, %17261) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%19854, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %19855 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %19856 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %19857 = "torch.aten.clamp"(%19854, %19855, %19856) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%19857, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %19858 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %19859 = "torch.prims.convert_element_type"(%19857, %19858) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%19859, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %19860 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %19861 = "torch.aten.unsqueeze"(%17263, %19860) : (!torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>
    %19862 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %19863 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %19864 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %19865 = "torch.prim.ListConstruct"(%19862, %19863, %19864) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %19866 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %19867 = "torch.aten.expand"(%19861, %19865, %19866) : (!torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ>
    %19868 = "torch_c.to_builtin_tensor"(%19859) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %19869 = "torch_c.to_builtin_tensor"(%19867) : (!torch.vtensor<[4,14336,4096],f8E4M3FNUZ>) -> tensor<4x14336x4096xf8E4M3FNUZ>
    %19870 = "util.call"(%19868, %19869) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x?x14336xf32>
    %19871 = "torch_c.from_builtin_tensor"(%19870) : (tensor<4x?x14336xf32>) -> !torch.vtensor<[4,?,14336],f32>
    "torch.bind_symbolic_shape"(%19871, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> ()
    %19872 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %19873 = "torch.prims.convert_element_type"(%19871, %19872) : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%19873, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %19874 = "torch.aten.mul.Tensor"(%19853, %19873) : (!torch.vtensor<[4,?,14336],bf16>, !torch.vtensor<[4,?,14336],bf16>) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%19874, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %19875 = "torch.aten.div.Tensor"(%19874, %17265) : (!torch.vtensor<[4,?,14336],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%19875, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %19876 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %19877 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %19878 = "torch.aten.clamp"(%19875, %19876, %19877) : (!torch.vtensor<[4,?,14336],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%19878, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %19879 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %19880 = "torch.prims.convert_element_type"(%19878, %19879) : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> !torch.vtensor<[4,?,14336],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%19880, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],f8E4M3FNUZ>, !torch.int) -> ()
    %19881 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %19882 = "torch.aten.unsqueeze"(%17267, %19881) : (!torch.vtensor<[4096,14336],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,14336],f8E4M3FNUZ>
    %19883 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %19884 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %19885 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %19886 = "torch.prim.ListConstruct"(%19883, %19884, %19885) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %19887 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %19888 = "torch.aten.expand"(%19882, %19886, %19887) : (!torch.vtensor<[1,4096,14336],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,14336],f8E4M3FNUZ>
    %19889 = "torch_c.to_builtin_tensor"(%19880) : (!torch.vtensor<[4,?,14336],f8E4M3FNUZ>) -> tensor<4x?x14336xf8E4M3FNUZ>
    %19890 = "torch_c.to_builtin_tensor"(%19888) : (!torch.vtensor<[4,4096,14336],f8E4M3FNUZ>) -> tensor<4x4096x14336xf8E4M3FNUZ>
    %19891 = "util.call"(%19889, %19890) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx14336xf8E4M3FNUZ_R4x4096x14336xf8E4M3FNUZ}> : (tensor<4x?x14336xf8E4M3FNUZ>, tensor<4x4096x14336xf8E4M3FNUZ>) -> tensor<4x?x4096xf32>
    %19892 = "torch_c.from_builtin_tensor"(%19891) : (tensor<4x?x4096xf32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%19892, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %19893 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %19894 = "torch.prims.convert_element_type"(%19892, %19893) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%19894, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %19895 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %19896 = "torch.aten.add.Tensor"(%19813, %19894, %19895) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%19896, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %19897 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %19898 = "torch.prims.convert_element_type"(%19896, %19897) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%19898, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %19899 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %19900 = "torch.aten.pow.Tensor_Scalar"(%19898, %19899) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%19900, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %19901 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %19902 = "torch.prim.ListConstruct"(%19901) : (!torch.int) -> !torch.list<int>
    %19903 = "torch.constant.bool"() <{value = true}> : () -> !torch.bool
    %19904 = "torch.constant.none"() : () -> !torch.none
    %19905 = "torch.aten.mean.dim"(%19900, %19902, %19903, %19904) : (!torch.vtensor<[4,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%19905, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %19906 = "torch.constant.float"() <{value = 1.000000e-05 : f64}> : () -> !torch.float
    %19907 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %19908 = "torch.aten.add.Scalar"(%19905, %19906, %19907) : (!torch.vtensor<[4,?,1],f32>, !torch.float, !torch.int) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%19908, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %19909 = "torch.aten.rsqrt"(%19908) : (!torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%19909, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %19910 = "torch.aten.mul.Tensor"(%19898, %19909) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%19910, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %19911 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %19912 = "torch.prims.convert_element_type"(%19910, %19911) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%19912, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %19913 = "torch.aten.mul.Tensor"(%17269, %19912) : (!torch.vtensor<[4096],bf16>, !torch.vtensor<[4,?,4096],bf16>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%19913, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %19914 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %19915 = "torch.prims.convert_element_type"(%19913, %19914) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%19915, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %19916 = "torch.aten.div.Tensor"(%19915, %17271) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%19916, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %19917 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %19918 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %19919 = "torch.aten.clamp"(%19916, %19917, %19918) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%19919, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %19920 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %19921 = "torch.prims.convert_element_type"(%19919, %19920) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%19921, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %19922 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %19923 = "torch.aten.unsqueeze"(%17273, %19922) : (!torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>
    %19924 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %19925 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %19926 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %19927 = "torch.prim.ListConstruct"(%19924, %19925, %19926) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %19928 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %19929 = "torch.aten.expand"(%19923, %19927, %19928) : (!torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ>
    %19930 = "torch_c.to_builtin_tensor"(%19921) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %19931 = "torch_c.to_builtin_tensor"(%19929) : (!torch.vtensor<[4,4096,4096],f8E4M3FNUZ>) -> tensor<4x4096x4096xf8E4M3FNUZ>
    %19932 = "util.call"(%19930, %19931) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x?x4096xf32>
    %19933 = "torch_c.from_builtin_tensor"(%19932) : (tensor<4x?x4096xf32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%19933, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %19934 = "torch.aten.div.Tensor"(%19933, %17275) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%19934, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %19935 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %19936 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %19937 = "torch.aten.clamp"(%19934, %19935, %19936) : (!torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%19937, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %19938 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %19939 = "torch.prims.convert_element_type"(%19937, %19938) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%19939, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %19940 = "torch.aten.div.Tensor"(%19915, %17277) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%19940, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %19941 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %19942 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %19943 = "torch.aten.clamp"(%19940, %19941, %19942) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%19943, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %19944 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %19945 = "torch.prims.convert_element_type"(%19943, %19944) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%19945, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %19946 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %19947 = "torch.aten.unsqueeze"(%17279, %19946) : (!torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>
    %19948 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %19949 = "torch.constant.int"() <{value = 1024 : i64}> : () -> !torch.int
    %19950 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %19951 = "torch.prim.ListConstruct"(%19948, %19949, %19950) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %19952 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %19953 = "torch.aten.expand"(%19947, %19951, %19952) : (!torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ>
    %19954 = "torch_c.to_builtin_tensor"(%19945) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %19955 = "torch_c.to_builtin_tensor"(%19953) : (!torch.vtensor<[4,1024,4096],f8E4M3FNUZ>) -> tensor<4x1024x4096xf8E4M3FNUZ>
    %19956 = "util.call"(%19954, %19955) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x?x1024xf32>
    %19957 = "torch_c.from_builtin_tensor"(%19956) : (tensor<4x?x1024xf32>) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%19957, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %19958 = "torch.aten.div.Tensor"(%19957, %17281) : (!torch.vtensor<[4,?,1024],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%19958, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %19959 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %19960 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %19961 = "torch.aten.clamp"(%19958, %19959, %19960) : (!torch.vtensor<[4,?,1024],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%19961, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %19962 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %19963 = "torch.prims.convert_element_type"(%19961, %19962) : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> !torch.vtensor<[4,?,1024],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%19963, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.int) -> ()
    %19964 = "torch.aten.div.Tensor"(%19915, %17283) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%19964, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %19965 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %19966 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %19967 = "torch.aten.clamp"(%19964, %19965, %19966) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%19967, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %19968 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %19969 = "torch.prims.convert_element_type"(%19967, %19968) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%19969, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %19970 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %19971 = "torch.aten.unsqueeze"(%17285, %19970) : (!torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>
    %19972 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %19973 = "torch.constant.int"() <{value = 1024 : i64}> : () -> !torch.int
    %19974 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %19975 = "torch.prim.ListConstruct"(%19972, %19973, %19974) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %19976 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %19977 = "torch.aten.expand"(%19971, %19975, %19976) : (!torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ>
    %19978 = "torch_c.to_builtin_tensor"(%19969) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %19979 = "torch_c.to_builtin_tensor"(%19977) : (!torch.vtensor<[4,1024,4096],f8E4M3FNUZ>) -> tensor<4x1024x4096xf8E4M3FNUZ>
    %19980 = "util.call"(%19978, %19979) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x?x1024xf32>
    %19981 = "torch_c.from_builtin_tensor"(%19980) : (tensor<4x?x1024xf32>) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%19981, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %19982 = "torch.aten.div.Tensor"(%19981, %17287) : (!torch.vtensor<[4,?,1024],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%19982, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %19983 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %19984 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %19985 = "torch.aten.clamp"(%19982, %19983, %19984) : (!torch.vtensor<[4,?,1024],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%19985, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %19986 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %19987 = "torch.prims.convert_element_type"(%19985, %19986) : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> !torch.vtensor<[4,?,1024],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%19987, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.int) -> ()
    %19988 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %19989 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %19990 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %19991 = "torch.prim.ListConstruct"(%19988, %18481, %19989, %19990) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %19992 = "torch.aten.view"(%19939, %19991) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%19992, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %19993 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %19994 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %19995 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %19996 = "torch.prim.ListConstruct"(%19993, %18481, %19994, %19995) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %19997 = "torch.aten.view"(%19963, %19996) : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%19997, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %19998 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %19999 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %20000 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %20001 = "torch.prim.ListConstruct"(%19998, %18481, %19999, %20000) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %20002 = "torch.aten.view"(%19987, %20001) : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%20002, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %20003 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
    %20004 = "torch.constant.none"() : () -> !torch.none
    %20005 = "torch.constant.none"() : () -> !torch.none
    %20006 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %20007 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %20008 = "torch.aten.arange"(%20003, %20004, %20005, %20006, %20007) : (!torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[131072],si64>
    %20009 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %20010 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %20011 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %20012 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %20013 = "torch.constant.none"() : () -> !torch.none
    %20014 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %20015 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %20016 = "torch.aten.arange.start_step"(%20009, %20010, %20011, %20012, %20013, %20014, %20015) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[64],si64>
    %20017 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %20018 = "torch.prims.convert_element_type"(%20016, %20017) : (!torch.vtensor<[64],si64>, !torch.int) -> !torch.vtensor<[64],f32>
    %20019 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %20020 = "torch.aten.div.Scalar"(%20018, %20019) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %20021 = "torch.constant.float"() <{value = 5.000000e+05 : f64}> : () -> !torch.float
    %20022 = "torch.aten.pow.Scalar"(%20021, %20020) : (!torch.float, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %20023 = "torch.aten.reciprocal"(%20022) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %20024 = "torch.constant.float"() <{value = 1.000000e+00 : f64}> : () -> !torch.float
    %20025 = "torch.aten.mul.Scalar"(%20023, %20024) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
    %20026 = "torch.aten.reciprocal"(%20025) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %20027 = "torch.constant.float"() <{value = 6.2831853071795862 : f64}> : () -> !torch.float
    %20028 = "torch.aten.mul.Scalar"(%20026, %20027) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
    %20029 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
    %20030 = "torch.aten.gt.Scalar"(%20028, %20029) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %20031 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %20032 = "torch.aten.div.Scalar"(%20025, %20031) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %20033 = "torch.aten.where.self"(%20030, %20032, %20025) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %20034 = "torch.aten.reciprocal"(%20028) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %20035 = "torch.constant.int"() <{value = 8192 : i64}> : () -> !torch.int
    %20036 = "torch.aten.mul.Scalar"(%20034, %20035) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %20037 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %20038 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %20039 = "torch.aten.sub.Scalar"(%20036, %20037, %20038) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
    %20040 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %20041 = "torch.aten.div.Scalar"(%20039, %20040) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %20042 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %20043 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %20044 = "torch.aten.rsub.Scalar"(%20041, %20042, %20043) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
    %20045 = "torch.aten.mul.Tensor"(%20044, %20033) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %20046 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %20047 = "torch.aten.div.Scalar"(%20045, %20046) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %20048 = "torch.aten.mul.Tensor"(%20041, %20033) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %20049 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %20050 = "torch.aten.add.Tensor"(%20047, %20048, %20049) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %20051 = "torch.constant.float"() <{value = 2.048000e+03 : f64}> : () -> !torch.float
    %20052 = "torch.aten.lt.Scalar"(%20028, %20051) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %20053 = "torch.aten.bitwise_not"(%20052) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %20054 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
    %20055 = "torch.aten.gt.Scalar"(%20028, %20054) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %20056 = "torch.aten.bitwise_not"(%20055) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %20057 = "torch.aten.mul.Tensor"(%20053, %20056) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %20058 = "torch.aten.where.self"(%20057, %20050, %20033) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %20059 = "torch.prim.ListConstruct"(%20058, %20058) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.list<vtensor>
    %20060 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %20061 = "torch.aten.cat"(%20059, %20060) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[128],f32>
    %20062 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %20063 = "torch.prims.convert_element_type"(%20008, %20062) : (!torch.vtensor<[131072],si64>, !torch.int) -> !torch.vtensor<[131072],f32>
    %20064 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %20065 = "torch.prims.convert_element_type"(%20061, %20064) : (!torch.vtensor<[128],f32>, !torch.int) -> !torch.vtensor<[128],f32>
    %20066 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
    %20067 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %20068 = "torch.prim.ListConstruct"(%20066, %20067) : (!torch.int, !torch.int) -> !torch.list<int>
    %20069 = "torch.aten.view"(%20063, %20068) : (!torch.vtensor<[131072],f32>, !torch.list<int>) -> !torch.vtensor<[131072,1],f32>
    %20070 = "torch.aten.mul.Tensor"(%20069, %20065) : (!torch.vtensor<[131072,1],f32>, !torch.vtensor<[128],f32>) -> !torch.vtensor<[131072,128],f32>
    %20071 = "torch.aten.cos"(%20070) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
    %20072 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %20073 = "torch.prims.convert_element_type"(%20071, %20072) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
    %20074 = "torch.aten.sin"(%20070) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
    %20075 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %20076 = "torch.prims.convert_element_type"(%20074, %20075) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
    %20077 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %20078 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %20079 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %20080 = "torch.aten.slice.Tensor"(%20073, %20077, %20078, %18481, %20079) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%20080, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %20081 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %20082 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %20083 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %20084 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %20085 = "torch.aten.slice.Tensor"(%20080, %20081, %20082, %20083, %20084) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%20085, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %20086 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %20087 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %20088 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %20089 = "torch.aten.slice.Tensor"(%20076, %20086, %20087, %18481, %20088) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%20089, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %20090 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %20091 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %20092 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %20093 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %20094 = "torch.aten.slice.Tensor"(%20089, %20090, %20091, %20092, %20093) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%20094, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %20095 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %20096 = "torch.aten.unsqueeze"(%20085, %20095) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%20096, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %20097 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %20098 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %20099 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %20100 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %20101 = "torch.aten.slice.Tensor"(%20096, %20097, %20098, %20099, %20100) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%20101, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %20102 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %20103 = "torch.aten.unsqueeze"(%20101, %20102) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%20103, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %20104 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %20105 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %20106 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %20107 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %20108 = "torch.aten.slice.Tensor"(%20103, %20104, %20105, %20106, %20107) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%20108, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %20109 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %20110 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %20111 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %20112 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %20113 = "torch.prim.ListConstruct"(%20109, %20110, %20111, %20112) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %20114 = "torch.aten.repeat"(%20108, %20113) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%20114, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
    %20115 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %20116 = "torch.aten.unsqueeze"(%20094, %20115) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%20116, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %20117 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %20118 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %20119 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %20120 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %20121 = "torch.aten.slice.Tensor"(%20116, %20117, %20118, %20119, %20120) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%20121, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %20122 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %20123 = "torch.aten.unsqueeze"(%20121, %20122) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%20123, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %20124 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %20125 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %20126 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %20127 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %20128 = "torch.aten.slice.Tensor"(%20123, %20124, %20125, %20126, %20127) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%20128, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %20129 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %20130 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %20131 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %20132 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %20133 = "torch.prim.ListConstruct"(%20129, %20130, %20131, %20132) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %20134 = "torch.aten.repeat"(%20128, %20133) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%20134, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
    %20135 = "torch.aten.mul.Tensor"(%19992, %20114) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%20135, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %20136 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %20137 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %20138 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %20139 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %20140 = "torch.aten.slice.Tensor"(%19992, %20136, %20137, %20138, %20139) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%20140, %18474) <{shape_expressions = #map18}> : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.int) -> ()
    %20141 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %20142 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %20143 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %20144 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %20145 = "torch.aten.slice.Tensor"(%19992, %20141, %20142, %20143, %20144) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%20145, %18474) <{shape_expressions = #map18}> : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.int) -> ()
    %20146 = "torch.aten.neg"(%20145) : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>) -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%20146, %18474) <{shape_expressions = #map18}> : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.int) -> ()
    %20147 = "torch.prim.ListConstruct"(%20146, %20140) : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>) -> !torch.list<vtensor>
    %20148 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %20149 = "torch.aten.cat"(%20147, %20148) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%20149, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %20150 = "torch.aten.mul.Tensor"(%20149, %20134) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%20150, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %20151 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %20152 = "torch.aten.add.Tensor"(%20135, %20150, %20151) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%20152, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %20153 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
    %20154 = "torch.constant.none"() : () -> !torch.none
    %20155 = "torch.constant.none"() : () -> !torch.none
    %20156 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %20157 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %20158 = "torch.aten.arange"(%20153, %20154, %20155, %20156, %20157) : (!torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[131072],si64>
    %20159 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %20160 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %20161 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %20162 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %20163 = "torch.constant.none"() : () -> !torch.none
    %20164 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %20165 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %20166 = "torch.aten.arange.start_step"(%20159, %20160, %20161, %20162, %20163, %20164, %20165) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[64],si64>
    %20167 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %20168 = "torch.prims.convert_element_type"(%20166, %20167) : (!torch.vtensor<[64],si64>, !torch.int) -> !torch.vtensor<[64],f32>
    %20169 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %20170 = "torch.aten.div.Scalar"(%20168, %20169) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %20171 = "torch.constant.float"() <{value = 5.000000e+05 : f64}> : () -> !torch.float
    %20172 = "torch.aten.pow.Scalar"(%20171, %20170) : (!torch.float, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %20173 = "torch.aten.reciprocal"(%20172) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %20174 = "torch.constant.float"() <{value = 1.000000e+00 : f64}> : () -> !torch.float
    %20175 = "torch.aten.mul.Scalar"(%20173, %20174) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
    %20176 = "torch.aten.reciprocal"(%20175) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %20177 = "torch.constant.float"() <{value = 6.2831853071795862 : f64}> : () -> !torch.float
    %20178 = "torch.aten.mul.Scalar"(%20176, %20177) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
    %20179 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
    %20180 = "torch.aten.gt.Scalar"(%20178, %20179) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %20181 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %20182 = "torch.aten.div.Scalar"(%20175, %20181) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %20183 = "torch.aten.where.self"(%20180, %20182, %20175) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %20184 = "torch.aten.reciprocal"(%20178) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %20185 = "torch.constant.int"() <{value = 8192 : i64}> : () -> !torch.int
    %20186 = "torch.aten.mul.Scalar"(%20184, %20185) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %20187 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %20188 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %20189 = "torch.aten.sub.Scalar"(%20186, %20187, %20188) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
    %20190 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %20191 = "torch.aten.div.Scalar"(%20189, %20190) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %20192 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %20193 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %20194 = "torch.aten.rsub.Scalar"(%20191, %20192, %20193) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
    %20195 = "torch.aten.mul.Tensor"(%20194, %20183) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %20196 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %20197 = "torch.aten.div.Scalar"(%20195, %20196) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %20198 = "torch.aten.mul.Tensor"(%20191, %20183) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %20199 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %20200 = "torch.aten.add.Tensor"(%20197, %20198, %20199) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %20201 = "torch.constant.float"() <{value = 2.048000e+03 : f64}> : () -> !torch.float
    %20202 = "torch.aten.lt.Scalar"(%20178, %20201) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %20203 = "torch.aten.bitwise_not"(%20202) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %20204 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
    %20205 = "torch.aten.gt.Scalar"(%20178, %20204) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %20206 = "torch.aten.bitwise_not"(%20205) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %20207 = "torch.aten.mul.Tensor"(%20203, %20206) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %20208 = "torch.aten.where.self"(%20207, %20200, %20183) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %20209 = "torch.prim.ListConstruct"(%20208, %20208) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.list<vtensor>
    %20210 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %20211 = "torch.aten.cat"(%20209, %20210) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[128],f32>
    %20212 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %20213 = "torch.prims.convert_element_type"(%20158, %20212) : (!torch.vtensor<[131072],si64>, !torch.int) -> !torch.vtensor<[131072],f32>
    %20214 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %20215 = "torch.prims.convert_element_type"(%20211, %20214) : (!torch.vtensor<[128],f32>, !torch.int) -> !torch.vtensor<[128],f32>
    %20216 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
    %20217 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %20218 = "torch.prim.ListConstruct"(%20216, %20217) : (!torch.int, !torch.int) -> !torch.list<int>
    %20219 = "torch.aten.view"(%20213, %20218) : (!torch.vtensor<[131072],f32>, !torch.list<int>) -> !torch.vtensor<[131072,1],f32>
    %20220 = "torch.aten.mul.Tensor"(%20219, %20215) : (!torch.vtensor<[131072,1],f32>, !torch.vtensor<[128],f32>) -> !torch.vtensor<[131072,128],f32>
    %20221 = "torch.aten.cos"(%20220) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
    %20222 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %20223 = "torch.prims.convert_element_type"(%20221, %20222) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
    %20224 = "torch.aten.sin"(%20220) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
    %20225 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %20226 = "torch.prims.convert_element_type"(%20224, %20225) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
    %20227 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %20228 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %20229 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %20230 = "torch.aten.slice.Tensor"(%20223, %20227, %20228, %18481, %20229) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%20230, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %20231 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %20232 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %20233 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %20234 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %20235 = "torch.aten.slice.Tensor"(%20230, %20231, %20232, %20233, %20234) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%20235, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %20236 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %20237 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %20238 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %20239 = "torch.aten.slice.Tensor"(%20226, %20236, %20237, %18481, %20238) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%20239, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %20240 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %20241 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %20242 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %20243 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %20244 = "torch.aten.slice.Tensor"(%20239, %20240, %20241, %20242, %20243) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%20244, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %20245 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %20246 = "torch.aten.unsqueeze"(%20235, %20245) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%20246, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %20247 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %20248 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %20249 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %20250 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %20251 = "torch.aten.slice.Tensor"(%20246, %20247, %20248, %20249, %20250) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%20251, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %20252 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %20253 = "torch.aten.unsqueeze"(%20251, %20252) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%20253, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %20254 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %20255 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %20256 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %20257 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %20258 = "torch.aten.slice.Tensor"(%20253, %20254, %20255, %20256, %20257) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%20258, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %20259 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %20260 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %20261 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %20262 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %20263 = "torch.prim.ListConstruct"(%20259, %20260, %20261, %20262) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %20264 = "torch.aten.repeat"(%20258, %20263) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%20264, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
    %20265 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %20266 = "torch.aten.unsqueeze"(%20244, %20265) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%20266, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %20267 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %20268 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %20269 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %20270 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %20271 = "torch.aten.slice.Tensor"(%20266, %20267, %20268, %20269, %20270) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%20271, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %20272 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %20273 = "torch.aten.unsqueeze"(%20271, %20272) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%20273, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %20274 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %20275 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %20276 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %20277 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %20278 = "torch.aten.slice.Tensor"(%20273, %20274, %20275, %20276, %20277) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%20278, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %20279 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %20280 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %20281 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %20282 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %20283 = "torch.prim.ListConstruct"(%20279, %20280, %20281, %20282) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %20284 = "torch.aten.repeat"(%20278, %20283) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%20284, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
    %20285 = "torch.aten.mul.Tensor"(%19997, %20264) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%20285, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %20286 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %20287 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %20288 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %20289 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %20290 = "torch.aten.slice.Tensor"(%19997, %20286, %20287, %20288, %20289) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%20290, %18474) <{shape_expressions = #map19}> : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.int) -> ()
    %20291 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %20292 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %20293 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %20294 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %20295 = "torch.aten.slice.Tensor"(%19997, %20291, %20292, %20293, %20294) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%20295, %18474) <{shape_expressions = #map19}> : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.int) -> ()
    %20296 = "torch.aten.neg"(%20295) : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>) -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%20296, %18474) <{shape_expressions = #map19}> : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.int) -> ()
    %20297 = "torch.prim.ListConstruct"(%20296, %20290) : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>) -> !torch.list<vtensor>
    %20298 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %20299 = "torch.aten.cat"(%20297, %20298) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%20299, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %20300 = "torch.aten.mul.Tensor"(%20299, %20284) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%20300, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %20301 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %20302 = "torch.aten.add.Tensor"(%20285, %20300, %20301) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%20302, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %20303 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %20304 = "torch.aten.mul.Scalar"(%arg69, %20303) : (!torch.vtensor<[4,?],si64>, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%20304, %18474) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %20305 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %20306 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %20307 = "torch.aten.add.Scalar"(%20304, %20305, %20306) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%20307, %18474) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %20308 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %20309 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %20310 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %20311 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %20312 = "torch.prim.ListConstruct"(%20308, %18477, %20309, %20310, %20311) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %20313 = "torch.aten.view"(%20302, %20312) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%20313, %18474) <{shape_expressions = #map22}> : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %20314 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %20315 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %20316 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %20317 = "torch.prim.ListConstruct"(%19011, %20314, %20315, %20316) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %20318 = "torch.aten.view"(%20313, %20317) : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%20318, %18474) <{shape_expressions = #map23}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %20319 = "torch.prim.ListConstruct"(%19011) : (!torch.int) -> !torch.list<int>
    %20320 = "torch.aten.view"(%20307, %20319) : (!torch.vtensor<[4,?],si64>, !torch.list<int>) -> !torch.vtensor<[?],si64>
    "torch.bind_symbolic_shape"(%20320, %18474) <{shape_expressions = #map24}> : (!torch.vtensor<[?],si64>, !torch.int) -> ()
    %20321 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %20322 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %20323 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %20324 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %20325 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %20326 = "torch.prim.ListConstruct"(%18479, %20321, %20322, %20323, %20324, %20325) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %20327 = "torch.aten.view"(%19729, %20326) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%20327, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %20328 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %20329 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %20330 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %20331 = "torch.prim.ListConstruct"(%18993, %20328, %20329, %20330) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %20332 = "torch.aten.view"(%20327, %20331) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%20332, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %20333 = "torch.prim.ListConstruct"(%20320) : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>>
    %20334 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %20335 = "torch.aten.index_put"(%20332, %20333, %20318, %20334) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.bool) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%20335, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %20336 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %20337 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %20338 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %20339 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %20340 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %20341 = "torch.prim.ListConstruct"(%18479, %20336, %20337, %20338, %20339, %20340) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %20342 = "torch.aten.view"(%20335, %20341) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%20342, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %20343 = "torch.constant.int"() <{value = 2097152 : i64}> : () -> !torch.int
    %20344 = "torch.prim.ListConstruct"(%18479, %20343) : (!torch.int, !torch.int) -> !torch.list<int>
    %20345 = "torch.aten.view"(%20342, %20344) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,2097152],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%20345, %18475) <{shape_expressions = #map2}> : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> ()
    %20346 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %20347 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %20348 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %20349 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %20350 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %20351 = "torch.prim.ListConstruct"(%18479, %20346, %20347, %20348, %20349, %20350) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %20352 = "torch.aten.view"(%20345, %20351) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%20352, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %20353 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %20354 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %20355 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %20356 = "torch.prim.ListConstruct"(%18993, %20353, %20354, %20355) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %20357 = "torch.aten.view"(%20352, %20356) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%20357, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %20358 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %20359 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %20360 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %20361 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %20362 = "torch.prim.ListConstruct"(%20358, %18477, %20359, %20360, %20361) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %20363 = "torch.aten.view"(%20002, %20362) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%20363, %18474) <{shape_expressions = #map22}> : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %20364 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %20365 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %20366 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %20367 = "torch.prim.ListConstruct"(%19011, %20364, %20365, %20366) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %20368 = "torch.aten.view"(%20363, %20367) : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%20368, %18474) <{shape_expressions = #map23}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %20369 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %20370 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %20371 = "torch.aten.add.Scalar"(%20307, %20369, %20370) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%20371, %18474) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %20372 = "torch.prim.ListConstruct"(%19011) : (!torch.int) -> !torch.list<int>
    %20373 = "torch.aten.view"(%20371, %20372) : (!torch.vtensor<[4,?],si64>, !torch.list<int>) -> !torch.vtensor<[?],si64>
    "torch.bind_symbolic_shape"(%20373, %18474) <{shape_expressions = #map24}> : (!torch.vtensor<[?],si64>, !torch.int) -> ()
    %20374 = "torch.prim.ListConstruct"(%20373) : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>>
    %20375 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %20376 = "torch.aten.index_put"(%20357, %20374, %20368, %20375) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.bool) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%20376, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %20377 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %20378 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %20379 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %20380 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %20381 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %20382 = "torch.prim.ListConstruct"(%18479, %20377, %20378, %20379, %20380, %20381) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %20383 = "torch.aten.view"(%20376, %20382) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%20383, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %20384 = "torch.constant.int"() <{value = 2097152 : i64}> : () -> !torch.int
    %20385 = "torch.prim.ListConstruct"(%18479, %20384) : (!torch.int, !torch.int) -> !torch.list<int>
    %20386 = "torch.aten.view"(%20383, %20385) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,2097152],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%20386, %18475) <{shape_expressions = #map2}> : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> ()
    %20387 = "torch.constant.int"() <{value = -2 : i64}> : () -> !torch.int
    %20388 = "torch.aten.unsqueeze"(%20302, %20387) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%20388, %18474) <{shape_expressions = #map25}> : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.int) -> ()
    %20389 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %20390 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %20391 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %20392 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %20393 = "torch.prim.ListConstruct"(%20389, %18481, %20390, %20391, %20392) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %20394 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %20395 = "torch.aten.expand"(%20388, %20393, %20394) : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%20395, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %20396 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %20397 = "torch.aten.clone"(%20395, %20396) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%20397, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %20398 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %20399 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %20400 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %20401 = "torch.prim.ListConstruct"(%20398, %18481, %20399, %20400) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %20402 = "torch.aten._unsafe_view"(%20397, %20401) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%20402, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %20403 = "torch.constant.int"() <{value = -2 : i64}> : () -> !torch.int
    %20404 = "torch.aten.unsqueeze"(%20002, %20403) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%20404, %18474) <{shape_expressions = #map25}> : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.int) -> ()
    %20405 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %20406 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %20407 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %20408 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %20409 = "torch.prim.ListConstruct"(%20405, %18481, %20406, %20407, %20408) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %20410 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %20411 = "torch.aten.expand"(%20404, %20409, %20410) : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%20411, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %20412 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %20413 = "torch.aten.clone"(%20411, %20412) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%20413, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %20414 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %20415 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %20416 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %20417 = "torch.prim.ListConstruct"(%20414, %18481, %20415, %20416) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %20418 = "torch.aten._unsafe_view"(%20413, %20417) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%20418, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %20419 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %20420 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %20421 = "torch.aten.transpose.int"(%20152, %20419, %20420) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%20421, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %20422 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %20423 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %20424 = "torch.aten.transpose.int"(%20402, %20422, %20423) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%20424, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %20425 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %20426 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %20427 = "torch.aten.transpose.int"(%20418, %20425, %20426) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%20427, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %20428 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %20429 = "torch.aten.squeeze.dim"(%18570, %20428) : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,1,?,?],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%20429, %18474) <{shape_expressions = #map8}> : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> ()
    %20430 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %20431 = "torch.aten.squeeze.dim"(%20429, %20430) : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,1,?,?],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%20431, %18474) <{shape_expressions = #map8}> : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> ()
    %20432 = "torch_c.to_builtin_tensor"(%20421) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %20433 = "torch_c.to_builtin_tensor"(%20424) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %20434 = "torch_c.to_builtin_tensor"(%20427) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %20435 = "torch_c.to_builtin_tensor"(%20431) : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>) -> tensor<4x1x?x?xf8E4M3FNUZ>
    %20436 = "tensor.cast"(%20435) : (tensor<4x1x?x?xf8E4M3FNUZ>) -> tensor<?x?x?x?xf8E4M3FNUZ>
    %20437 = "torch_c.to_builtin_tensor"(%17289) : (!torch.vtensor<[],f32>) -> tensor<f32>
    %20438 = "util.call"(%20432, %20433, %20434, %20437, %20436) <{callee = @sharktank_masked_flash_attention_4_32_128_128_f8E4M3FNUZ_f32_f32}> : (tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<f32>, tensor<?x?x?x?xf8E4M3FNUZ>) -> tensor<4x32x?x128xf32>
    %20439 = "torch_c.from_builtin_tensor"(%20438) : (tensor<4x32x?x128xf32>) -> !torch.vtensor<[4,32,?,128],f32>
    "torch.bind_symbolic_shape"(%20439, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f32>, !torch.int) -> ()
    %20440 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %20441 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %20442 = "torch.aten.transpose.int"(%20439, %20440, %20441) : (!torch.vtensor<[4,32,?,128],f32>, !torch.int, !torch.int) -> !torch.vtensor<[4,?,32,128],f32>
    "torch.bind_symbolic_shape"(%20442, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f32>, !torch.int) -> ()
    %20443 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %20444 = "torch.aten.clone"(%20442, %20443) : (!torch.vtensor<[4,?,32,128],f32>, !torch.int) -> !torch.vtensor<[4,?,32,128],f32>
    "torch.bind_symbolic_shape"(%20444, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f32>, !torch.int) -> ()
    %20445 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %20446 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %20447 = "torch.prim.ListConstruct"(%20445, %18481, %20446) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %20448 = "torch.aten._unsafe_view"(%20444, %20447) : (!torch.vtensor<[4,?,32,128],f32>, !torch.list<int>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%20448, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %20449 = "torch.aten.div.Tensor"(%20448, %17291) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%20449, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %20450 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %20451 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %20452 = "torch.aten.clamp"(%20449, %20450, %20451) : (!torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%20452, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %20453 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %20454 = "torch.prims.convert_element_type"(%20452, %20453) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%20454, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %20455 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %20456 = "torch.aten.unsqueeze"(%17293, %20455) : (!torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>
    %20457 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %20458 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %20459 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %20460 = "torch.prim.ListConstruct"(%20457, %20458, %20459) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %20461 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %20462 = "torch.aten.expand"(%20456, %20460, %20461) : (!torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ>
    %20463 = "torch_c.to_builtin_tensor"(%20454) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %20464 = "torch_c.to_builtin_tensor"(%20462) : (!torch.vtensor<[4,4096,4096],f8E4M3FNUZ>) -> tensor<4x4096x4096xf8E4M3FNUZ>
    %20465 = "util.call"(%20463, %20464) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x?x4096xf32>
    %20466 = "torch_c.from_builtin_tensor"(%20465) : (tensor<4x?x4096xf32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%20466, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %20467 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %20468 = "torch.prims.convert_element_type"(%20466, %20467) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%20468, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %20469 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %20470 = "torch.aten.add.Tensor"(%19896, %20468, %20469) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%20470, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %20471 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %20472 = "torch.prims.convert_element_type"(%20470, %20471) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%20472, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %20473 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %20474 = "torch.aten.pow.Tensor_Scalar"(%20472, %20473) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%20474, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %20475 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %20476 = "torch.prim.ListConstruct"(%20475) : (!torch.int) -> !torch.list<int>
    %20477 = "torch.constant.bool"() <{value = true}> : () -> !torch.bool
    %20478 = "torch.constant.none"() : () -> !torch.none
    %20479 = "torch.aten.mean.dim"(%20474, %20476, %20477, %20478) : (!torch.vtensor<[4,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%20479, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %20480 = "torch.constant.float"() <{value = 1.000000e-05 : f64}> : () -> !torch.float
    %20481 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %20482 = "torch.aten.add.Scalar"(%20479, %20480, %20481) : (!torch.vtensor<[4,?,1],f32>, !torch.float, !torch.int) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%20482, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %20483 = "torch.aten.rsqrt"(%20482) : (!torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%20483, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %20484 = "torch.aten.mul.Tensor"(%20472, %20483) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%20484, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %20485 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %20486 = "torch.prims.convert_element_type"(%20484, %20485) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%20486, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %20487 = "torch.aten.mul.Tensor"(%17295, %20486) : (!torch.vtensor<[4096],bf16>, !torch.vtensor<[4,?,4096],bf16>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%20487, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %20488 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %20489 = "torch.prims.convert_element_type"(%20487, %20488) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%20489, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %20490 = "torch.aten.div.Tensor"(%20489, %17297) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%20490, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %20491 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %20492 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %20493 = "torch.aten.clamp"(%20490, %20491, %20492) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%20493, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %20494 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %20495 = "torch.prims.convert_element_type"(%20493, %20494) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%20495, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %20496 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %20497 = "torch.aten.unsqueeze"(%17299, %20496) : (!torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>
    %20498 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %20499 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %20500 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %20501 = "torch.prim.ListConstruct"(%20498, %20499, %20500) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %20502 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %20503 = "torch.aten.expand"(%20497, %20501, %20502) : (!torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ>
    %20504 = "torch_c.to_builtin_tensor"(%20495) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %20505 = "torch_c.to_builtin_tensor"(%20503) : (!torch.vtensor<[4,14336,4096],f8E4M3FNUZ>) -> tensor<4x14336x4096xf8E4M3FNUZ>
    %20506 = "util.call"(%20504, %20505) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x?x14336xf32>
    %20507 = "torch_c.from_builtin_tensor"(%20506) : (tensor<4x?x14336xf32>) -> !torch.vtensor<[4,?,14336],f32>
    "torch.bind_symbolic_shape"(%20507, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> ()
    %20508 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %20509 = "torch.prims.convert_element_type"(%20507, %20508) : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%20509, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %20510 = "torch.aten.silu"(%20509) : (!torch.vtensor<[4,?,14336],bf16>) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%20510, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %20511 = "torch.aten.div.Tensor"(%20489, %17301) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%20511, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %20512 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %20513 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %20514 = "torch.aten.clamp"(%20511, %20512, %20513) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%20514, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %20515 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %20516 = "torch.prims.convert_element_type"(%20514, %20515) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%20516, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %20517 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %20518 = "torch.aten.unsqueeze"(%17303, %20517) : (!torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>
    %20519 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %20520 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %20521 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %20522 = "torch.prim.ListConstruct"(%20519, %20520, %20521) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %20523 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %20524 = "torch.aten.expand"(%20518, %20522, %20523) : (!torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ>
    %20525 = "torch_c.to_builtin_tensor"(%20516) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %20526 = "torch_c.to_builtin_tensor"(%20524) : (!torch.vtensor<[4,14336,4096],f8E4M3FNUZ>) -> tensor<4x14336x4096xf8E4M3FNUZ>
    %20527 = "util.call"(%20525, %20526) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x?x14336xf32>
    %20528 = "torch_c.from_builtin_tensor"(%20527) : (tensor<4x?x14336xf32>) -> !torch.vtensor<[4,?,14336],f32>
    "torch.bind_symbolic_shape"(%20528, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> ()
    %20529 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %20530 = "torch.prims.convert_element_type"(%20528, %20529) : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%20530, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %20531 = "torch.aten.mul.Tensor"(%20510, %20530) : (!torch.vtensor<[4,?,14336],bf16>, !torch.vtensor<[4,?,14336],bf16>) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%20531, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %20532 = "torch.aten.div.Tensor"(%20531, %17305) : (!torch.vtensor<[4,?,14336],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%20532, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %20533 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %20534 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %20535 = "torch.aten.clamp"(%20532, %20533, %20534) : (!torch.vtensor<[4,?,14336],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%20535, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %20536 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %20537 = "torch.prims.convert_element_type"(%20535, %20536) : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> !torch.vtensor<[4,?,14336],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%20537, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],f8E4M3FNUZ>, !torch.int) -> ()
    %20538 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %20539 = "torch.aten.unsqueeze"(%17307, %20538) : (!torch.vtensor<[4096,14336],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,14336],f8E4M3FNUZ>
    %20540 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %20541 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %20542 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %20543 = "torch.prim.ListConstruct"(%20540, %20541, %20542) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %20544 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %20545 = "torch.aten.expand"(%20539, %20543, %20544) : (!torch.vtensor<[1,4096,14336],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,14336],f8E4M3FNUZ>
    %20546 = "torch_c.to_builtin_tensor"(%20537) : (!torch.vtensor<[4,?,14336],f8E4M3FNUZ>) -> tensor<4x?x14336xf8E4M3FNUZ>
    %20547 = "torch_c.to_builtin_tensor"(%20545) : (!torch.vtensor<[4,4096,14336],f8E4M3FNUZ>) -> tensor<4x4096x14336xf8E4M3FNUZ>
    %20548 = "util.call"(%20546, %20547) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx14336xf8E4M3FNUZ_R4x4096x14336xf8E4M3FNUZ}> : (tensor<4x?x14336xf8E4M3FNUZ>, tensor<4x4096x14336xf8E4M3FNUZ>) -> tensor<4x?x4096xf32>
    %20549 = "torch_c.from_builtin_tensor"(%20548) : (tensor<4x?x4096xf32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%20549, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %20550 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %20551 = "torch.prims.convert_element_type"(%20549, %20550) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%20551, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %20552 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %20553 = "torch.aten.add.Tensor"(%20470, %20551, %20552) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%20553, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %20554 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %20555 = "torch.prims.convert_element_type"(%20553, %20554) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%20555, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %20556 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %20557 = "torch.aten.pow.Tensor_Scalar"(%20555, %20556) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%20557, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %20558 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %20559 = "torch.prim.ListConstruct"(%20558) : (!torch.int) -> !torch.list<int>
    %20560 = "torch.constant.bool"() <{value = true}> : () -> !torch.bool
    %20561 = "torch.constant.none"() : () -> !torch.none
    %20562 = "torch.aten.mean.dim"(%20557, %20559, %20560, %20561) : (!torch.vtensor<[4,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%20562, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %20563 = "torch.constant.float"() <{value = 1.000000e-05 : f64}> : () -> !torch.float
    %20564 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %20565 = "torch.aten.add.Scalar"(%20562, %20563, %20564) : (!torch.vtensor<[4,?,1],f32>, !torch.float, !torch.int) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%20565, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %20566 = "torch.aten.rsqrt"(%20565) : (!torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%20566, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %20567 = "torch.aten.mul.Tensor"(%20555, %20566) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%20567, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %20568 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %20569 = "torch.prims.convert_element_type"(%20567, %20568) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%20569, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %20570 = "torch.aten.mul.Tensor"(%17309, %20569) : (!torch.vtensor<[4096],bf16>, !torch.vtensor<[4,?,4096],bf16>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%20570, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %20571 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %20572 = "torch.prims.convert_element_type"(%20570, %20571) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%20572, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %20573 = "torch.aten.div.Tensor"(%20572, %17311) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%20573, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %20574 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %20575 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %20576 = "torch.aten.clamp"(%20573, %20574, %20575) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%20576, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %20577 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %20578 = "torch.prims.convert_element_type"(%20576, %20577) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%20578, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %20579 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %20580 = "torch.aten.unsqueeze"(%17313, %20579) : (!torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>
    %20581 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %20582 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %20583 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %20584 = "torch.prim.ListConstruct"(%20581, %20582, %20583) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %20585 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %20586 = "torch.aten.expand"(%20580, %20584, %20585) : (!torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ>
    %20587 = "torch_c.to_builtin_tensor"(%20578) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %20588 = "torch_c.to_builtin_tensor"(%20586) : (!torch.vtensor<[4,4096,4096],f8E4M3FNUZ>) -> tensor<4x4096x4096xf8E4M3FNUZ>
    %20589 = "util.call"(%20587, %20588) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x?x4096xf32>
    %20590 = "torch_c.from_builtin_tensor"(%20589) : (tensor<4x?x4096xf32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%20590, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %20591 = "torch.aten.div.Tensor"(%20590, %17315) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%20591, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %20592 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %20593 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %20594 = "torch.aten.clamp"(%20591, %20592, %20593) : (!torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%20594, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %20595 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %20596 = "torch.prims.convert_element_type"(%20594, %20595) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%20596, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %20597 = "torch.aten.div.Tensor"(%20572, %17317) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%20597, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %20598 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %20599 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %20600 = "torch.aten.clamp"(%20597, %20598, %20599) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%20600, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %20601 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %20602 = "torch.prims.convert_element_type"(%20600, %20601) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%20602, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %20603 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %20604 = "torch.aten.unsqueeze"(%17319, %20603) : (!torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>
    %20605 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %20606 = "torch.constant.int"() <{value = 1024 : i64}> : () -> !torch.int
    %20607 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %20608 = "torch.prim.ListConstruct"(%20605, %20606, %20607) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %20609 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %20610 = "torch.aten.expand"(%20604, %20608, %20609) : (!torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ>
    %20611 = "torch_c.to_builtin_tensor"(%20602) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %20612 = "torch_c.to_builtin_tensor"(%20610) : (!torch.vtensor<[4,1024,4096],f8E4M3FNUZ>) -> tensor<4x1024x4096xf8E4M3FNUZ>
    %20613 = "util.call"(%20611, %20612) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x?x1024xf32>
    %20614 = "torch_c.from_builtin_tensor"(%20613) : (tensor<4x?x1024xf32>) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%20614, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %20615 = "torch.aten.div.Tensor"(%20614, %17321) : (!torch.vtensor<[4,?,1024],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%20615, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %20616 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %20617 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %20618 = "torch.aten.clamp"(%20615, %20616, %20617) : (!torch.vtensor<[4,?,1024],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%20618, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %20619 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %20620 = "torch.prims.convert_element_type"(%20618, %20619) : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> !torch.vtensor<[4,?,1024],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%20620, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.int) -> ()
    %20621 = "torch.aten.div.Tensor"(%20572, %17323) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%20621, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %20622 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %20623 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %20624 = "torch.aten.clamp"(%20621, %20622, %20623) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%20624, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %20625 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %20626 = "torch.prims.convert_element_type"(%20624, %20625) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%20626, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %20627 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %20628 = "torch.aten.unsqueeze"(%17325, %20627) : (!torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>
    %20629 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %20630 = "torch.constant.int"() <{value = 1024 : i64}> : () -> !torch.int
    %20631 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %20632 = "torch.prim.ListConstruct"(%20629, %20630, %20631) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %20633 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %20634 = "torch.aten.expand"(%20628, %20632, %20633) : (!torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ>
    %20635 = "torch_c.to_builtin_tensor"(%20626) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %20636 = "torch_c.to_builtin_tensor"(%20634) : (!torch.vtensor<[4,1024,4096],f8E4M3FNUZ>) -> tensor<4x1024x4096xf8E4M3FNUZ>
    %20637 = "util.call"(%20635, %20636) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x?x1024xf32>
    %20638 = "torch_c.from_builtin_tensor"(%20637) : (tensor<4x?x1024xf32>) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%20638, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %20639 = "torch.aten.div.Tensor"(%20638, %17327) : (!torch.vtensor<[4,?,1024],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%20639, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %20640 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %20641 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %20642 = "torch.aten.clamp"(%20639, %20640, %20641) : (!torch.vtensor<[4,?,1024],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%20642, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %20643 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %20644 = "torch.prims.convert_element_type"(%20642, %20643) : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> !torch.vtensor<[4,?,1024],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%20644, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.int) -> ()
    %20645 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %20646 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %20647 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %20648 = "torch.prim.ListConstruct"(%20645, %18481, %20646, %20647) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %20649 = "torch.aten.view"(%20596, %20648) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%20649, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %20650 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %20651 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %20652 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %20653 = "torch.prim.ListConstruct"(%20650, %18481, %20651, %20652) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %20654 = "torch.aten.view"(%20620, %20653) : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%20654, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %20655 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %20656 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %20657 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %20658 = "torch.prim.ListConstruct"(%20655, %18481, %20656, %20657) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %20659 = "torch.aten.view"(%20644, %20658) : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%20659, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %20660 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
    %20661 = "torch.constant.none"() : () -> !torch.none
    %20662 = "torch.constant.none"() : () -> !torch.none
    %20663 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %20664 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %20665 = "torch.aten.arange"(%20660, %20661, %20662, %20663, %20664) : (!torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[131072],si64>
    %20666 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %20667 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %20668 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %20669 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %20670 = "torch.constant.none"() : () -> !torch.none
    %20671 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %20672 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %20673 = "torch.aten.arange.start_step"(%20666, %20667, %20668, %20669, %20670, %20671, %20672) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[64],si64>
    %20674 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %20675 = "torch.prims.convert_element_type"(%20673, %20674) : (!torch.vtensor<[64],si64>, !torch.int) -> !torch.vtensor<[64],f32>
    %20676 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %20677 = "torch.aten.div.Scalar"(%20675, %20676) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %20678 = "torch.constant.float"() <{value = 5.000000e+05 : f64}> : () -> !torch.float
    %20679 = "torch.aten.pow.Scalar"(%20678, %20677) : (!torch.float, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %20680 = "torch.aten.reciprocal"(%20679) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %20681 = "torch.constant.float"() <{value = 1.000000e+00 : f64}> : () -> !torch.float
    %20682 = "torch.aten.mul.Scalar"(%20680, %20681) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
    %20683 = "torch.aten.reciprocal"(%20682) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %20684 = "torch.constant.float"() <{value = 6.2831853071795862 : f64}> : () -> !torch.float
    %20685 = "torch.aten.mul.Scalar"(%20683, %20684) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
    %20686 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
    %20687 = "torch.aten.gt.Scalar"(%20685, %20686) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %20688 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %20689 = "torch.aten.div.Scalar"(%20682, %20688) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %20690 = "torch.aten.where.self"(%20687, %20689, %20682) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %20691 = "torch.aten.reciprocal"(%20685) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %20692 = "torch.constant.int"() <{value = 8192 : i64}> : () -> !torch.int
    %20693 = "torch.aten.mul.Scalar"(%20691, %20692) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %20694 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %20695 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %20696 = "torch.aten.sub.Scalar"(%20693, %20694, %20695) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
    %20697 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %20698 = "torch.aten.div.Scalar"(%20696, %20697) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %20699 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %20700 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %20701 = "torch.aten.rsub.Scalar"(%20698, %20699, %20700) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
    %20702 = "torch.aten.mul.Tensor"(%20701, %20690) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %20703 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %20704 = "torch.aten.div.Scalar"(%20702, %20703) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %20705 = "torch.aten.mul.Tensor"(%20698, %20690) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %20706 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %20707 = "torch.aten.add.Tensor"(%20704, %20705, %20706) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %20708 = "torch.constant.float"() <{value = 2.048000e+03 : f64}> : () -> !torch.float
    %20709 = "torch.aten.lt.Scalar"(%20685, %20708) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %20710 = "torch.aten.bitwise_not"(%20709) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %20711 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
    %20712 = "torch.aten.gt.Scalar"(%20685, %20711) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %20713 = "torch.aten.bitwise_not"(%20712) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %20714 = "torch.aten.mul.Tensor"(%20710, %20713) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %20715 = "torch.aten.where.self"(%20714, %20707, %20690) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %20716 = "torch.prim.ListConstruct"(%20715, %20715) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.list<vtensor>
    %20717 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %20718 = "torch.aten.cat"(%20716, %20717) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[128],f32>
    %20719 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %20720 = "torch.prims.convert_element_type"(%20665, %20719) : (!torch.vtensor<[131072],si64>, !torch.int) -> !torch.vtensor<[131072],f32>
    %20721 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %20722 = "torch.prims.convert_element_type"(%20718, %20721) : (!torch.vtensor<[128],f32>, !torch.int) -> !torch.vtensor<[128],f32>
    %20723 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
    %20724 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %20725 = "torch.prim.ListConstruct"(%20723, %20724) : (!torch.int, !torch.int) -> !torch.list<int>
    %20726 = "torch.aten.view"(%20720, %20725) : (!torch.vtensor<[131072],f32>, !torch.list<int>) -> !torch.vtensor<[131072,1],f32>
    %20727 = "torch.aten.mul.Tensor"(%20726, %20722) : (!torch.vtensor<[131072,1],f32>, !torch.vtensor<[128],f32>) -> !torch.vtensor<[131072,128],f32>
    %20728 = "torch.aten.cos"(%20727) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
    %20729 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %20730 = "torch.prims.convert_element_type"(%20728, %20729) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
    %20731 = "torch.aten.sin"(%20727) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
    %20732 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %20733 = "torch.prims.convert_element_type"(%20731, %20732) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
    %20734 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %20735 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %20736 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %20737 = "torch.aten.slice.Tensor"(%20730, %20734, %20735, %18481, %20736) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%20737, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %20738 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %20739 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %20740 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %20741 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %20742 = "torch.aten.slice.Tensor"(%20737, %20738, %20739, %20740, %20741) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%20742, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %20743 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %20744 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %20745 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %20746 = "torch.aten.slice.Tensor"(%20733, %20743, %20744, %18481, %20745) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%20746, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %20747 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %20748 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %20749 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %20750 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %20751 = "torch.aten.slice.Tensor"(%20746, %20747, %20748, %20749, %20750) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%20751, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %20752 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %20753 = "torch.aten.unsqueeze"(%20742, %20752) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%20753, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %20754 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %20755 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %20756 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %20757 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %20758 = "torch.aten.slice.Tensor"(%20753, %20754, %20755, %20756, %20757) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%20758, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %20759 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %20760 = "torch.aten.unsqueeze"(%20758, %20759) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%20760, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %20761 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %20762 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %20763 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %20764 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %20765 = "torch.aten.slice.Tensor"(%20760, %20761, %20762, %20763, %20764) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%20765, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %20766 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %20767 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %20768 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %20769 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %20770 = "torch.prim.ListConstruct"(%20766, %20767, %20768, %20769) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %20771 = "torch.aten.repeat"(%20765, %20770) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%20771, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
    %20772 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %20773 = "torch.aten.unsqueeze"(%20751, %20772) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%20773, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %20774 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %20775 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %20776 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %20777 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %20778 = "torch.aten.slice.Tensor"(%20773, %20774, %20775, %20776, %20777) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%20778, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %20779 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %20780 = "torch.aten.unsqueeze"(%20778, %20779) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%20780, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %20781 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %20782 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %20783 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %20784 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %20785 = "torch.aten.slice.Tensor"(%20780, %20781, %20782, %20783, %20784) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%20785, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %20786 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %20787 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %20788 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %20789 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %20790 = "torch.prim.ListConstruct"(%20786, %20787, %20788, %20789) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %20791 = "torch.aten.repeat"(%20785, %20790) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%20791, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
    %20792 = "torch.aten.mul.Tensor"(%20649, %20771) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%20792, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %20793 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %20794 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %20795 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %20796 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %20797 = "torch.aten.slice.Tensor"(%20649, %20793, %20794, %20795, %20796) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%20797, %18474) <{shape_expressions = #map18}> : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.int) -> ()
    %20798 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %20799 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %20800 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %20801 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %20802 = "torch.aten.slice.Tensor"(%20649, %20798, %20799, %20800, %20801) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%20802, %18474) <{shape_expressions = #map18}> : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.int) -> ()
    %20803 = "torch.aten.neg"(%20802) : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>) -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%20803, %18474) <{shape_expressions = #map18}> : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.int) -> ()
    %20804 = "torch.prim.ListConstruct"(%20803, %20797) : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>) -> !torch.list<vtensor>
    %20805 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %20806 = "torch.aten.cat"(%20804, %20805) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%20806, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %20807 = "torch.aten.mul.Tensor"(%20806, %20791) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%20807, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %20808 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %20809 = "torch.aten.add.Tensor"(%20792, %20807, %20808) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%20809, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %20810 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
    %20811 = "torch.constant.none"() : () -> !torch.none
    %20812 = "torch.constant.none"() : () -> !torch.none
    %20813 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %20814 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %20815 = "torch.aten.arange"(%20810, %20811, %20812, %20813, %20814) : (!torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[131072],si64>
    %20816 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %20817 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %20818 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %20819 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %20820 = "torch.constant.none"() : () -> !torch.none
    %20821 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %20822 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %20823 = "torch.aten.arange.start_step"(%20816, %20817, %20818, %20819, %20820, %20821, %20822) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[64],si64>
    %20824 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %20825 = "torch.prims.convert_element_type"(%20823, %20824) : (!torch.vtensor<[64],si64>, !torch.int) -> !torch.vtensor<[64],f32>
    %20826 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %20827 = "torch.aten.div.Scalar"(%20825, %20826) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %20828 = "torch.constant.float"() <{value = 5.000000e+05 : f64}> : () -> !torch.float
    %20829 = "torch.aten.pow.Scalar"(%20828, %20827) : (!torch.float, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %20830 = "torch.aten.reciprocal"(%20829) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %20831 = "torch.constant.float"() <{value = 1.000000e+00 : f64}> : () -> !torch.float
    %20832 = "torch.aten.mul.Scalar"(%20830, %20831) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
    %20833 = "torch.aten.reciprocal"(%20832) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %20834 = "torch.constant.float"() <{value = 6.2831853071795862 : f64}> : () -> !torch.float
    %20835 = "torch.aten.mul.Scalar"(%20833, %20834) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
    %20836 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
    %20837 = "torch.aten.gt.Scalar"(%20835, %20836) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %20838 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %20839 = "torch.aten.div.Scalar"(%20832, %20838) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %20840 = "torch.aten.where.self"(%20837, %20839, %20832) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %20841 = "torch.aten.reciprocal"(%20835) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %20842 = "torch.constant.int"() <{value = 8192 : i64}> : () -> !torch.int
    %20843 = "torch.aten.mul.Scalar"(%20841, %20842) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %20844 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %20845 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %20846 = "torch.aten.sub.Scalar"(%20843, %20844, %20845) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
    %20847 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %20848 = "torch.aten.div.Scalar"(%20846, %20847) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %20849 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %20850 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %20851 = "torch.aten.rsub.Scalar"(%20848, %20849, %20850) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
    %20852 = "torch.aten.mul.Tensor"(%20851, %20840) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %20853 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %20854 = "torch.aten.div.Scalar"(%20852, %20853) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %20855 = "torch.aten.mul.Tensor"(%20848, %20840) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %20856 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %20857 = "torch.aten.add.Tensor"(%20854, %20855, %20856) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %20858 = "torch.constant.float"() <{value = 2.048000e+03 : f64}> : () -> !torch.float
    %20859 = "torch.aten.lt.Scalar"(%20835, %20858) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %20860 = "torch.aten.bitwise_not"(%20859) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %20861 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
    %20862 = "torch.aten.gt.Scalar"(%20835, %20861) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %20863 = "torch.aten.bitwise_not"(%20862) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %20864 = "torch.aten.mul.Tensor"(%20860, %20863) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %20865 = "torch.aten.where.self"(%20864, %20857, %20840) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %20866 = "torch.prim.ListConstruct"(%20865, %20865) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.list<vtensor>
    %20867 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %20868 = "torch.aten.cat"(%20866, %20867) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[128],f32>
    %20869 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %20870 = "torch.prims.convert_element_type"(%20815, %20869) : (!torch.vtensor<[131072],si64>, !torch.int) -> !torch.vtensor<[131072],f32>
    %20871 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %20872 = "torch.prims.convert_element_type"(%20868, %20871) : (!torch.vtensor<[128],f32>, !torch.int) -> !torch.vtensor<[128],f32>
    %20873 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
    %20874 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %20875 = "torch.prim.ListConstruct"(%20873, %20874) : (!torch.int, !torch.int) -> !torch.list<int>
    %20876 = "torch.aten.view"(%20870, %20875) : (!torch.vtensor<[131072],f32>, !torch.list<int>) -> !torch.vtensor<[131072,1],f32>
    %20877 = "torch.aten.mul.Tensor"(%20876, %20872) : (!torch.vtensor<[131072,1],f32>, !torch.vtensor<[128],f32>) -> !torch.vtensor<[131072,128],f32>
    %20878 = "torch.aten.cos"(%20877) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
    %20879 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %20880 = "torch.prims.convert_element_type"(%20878, %20879) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
    %20881 = "torch.aten.sin"(%20877) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
    %20882 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %20883 = "torch.prims.convert_element_type"(%20881, %20882) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
    %20884 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %20885 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %20886 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %20887 = "torch.aten.slice.Tensor"(%20880, %20884, %20885, %18481, %20886) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%20887, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %20888 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %20889 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %20890 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %20891 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %20892 = "torch.aten.slice.Tensor"(%20887, %20888, %20889, %20890, %20891) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%20892, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %20893 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %20894 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %20895 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %20896 = "torch.aten.slice.Tensor"(%20883, %20893, %20894, %18481, %20895) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%20896, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %20897 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %20898 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %20899 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %20900 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %20901 = "torch.aten.slice.Tensor"(%20896, %20897, %20898, %20899, %20900) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%20901, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %20902 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %20903 = "torch.aten.unsqueeze"(%20892, %20902) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%20903, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %20904 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %20905 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %20906 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %20907 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %20908 = "torch.aten.slice.Tensor"(%20903, %20904, %20905, %20906, %20907) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%20908, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %20909 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %20910 = "torch.aten.unsqueeze"(%20908, %20909) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%20910, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %20911 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %20912 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %20913 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %20914 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %20915 = "torch.aten.slice.Tensor"(%20910, %20911, %20912, %20913, %20914) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%20915, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %20916 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %20917 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %20918 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %20919 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %20920 = "torch.prim.ListConstruct"(%20916, %20917, %20918, %20919) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %20921 = "torch.aten.repeat"(%20915, %20920) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%20921, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
    %20922 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %20923 = "torch.aten.unsqueeze"(%20901, %20922) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%20923, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %20924 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %20925 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %20926 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %20927 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %20928 = "torch.aten.slice.Tensor"(%20923, %20924, %20925, %20926, %20927) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%20928, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %20929 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %20930 = "torch.aten.unsqueeze"(%20928, %20929) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%20930, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %20931 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %20932 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %20933 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %20934 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %20935 = "torch.aten.slice.Tensor"(%20930, %20931, %20932, %20933, %20934) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%20935, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %20936 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %20937 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %20938 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %20939 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %20940 = "torch.prim.ListConstruct"(%20936, %20937, %20938, %20939) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %20941 = "torch.aten.repeat"(%20935, %20940) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%20941, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
    %20942 = "torch.aten.mul.Tensor"(%20654, %20921) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%20942, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %20943 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %20944 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %20945 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %20946 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %20947 = "torch.aten.slice.Tensor"(%20654, %20943, %20944, %20945, %20946) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%20947, %18474) <{shape_expressions = #map19}> : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.int) -> ()
    %20948 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %20949 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %20950 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %20951 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %20952 = "torch.aten.slice.Tensor"(%20654, %20948, %20949, %20950, %20951) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%20952, %18474) <{shape_expressions = #map19}> : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.int) -> ()
    %20953 = "torch.aten.neg"(%20952) : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>) -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%20953, %18474) <{shape_expressions = #map19}> : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.int) -> ()
    %20954 = "torch.prim.ListConstruct"(%20953, %20947) : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>) -> !torch.list<vtensor>
    %20955 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %20956 = "torch.aten.cat"(%20954, %20955) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%20956, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %20957 = "torch.aten.mul.Tensor"(%20956, %20941) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%20957, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %20958 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %20959 = "torch.aten.add.Tensor"(%20942, %20957, %20958) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%20959, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %20960 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %20961 = "torch.aten.mul.Scalar"(%arg69, %20960) : (!torch.vtensor<[4,?],si64>, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%20961, %18474) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %20962 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %20963 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %20964 = "torch.aten.add.Scalar"(%20961, %20962, %20963) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%20964, %18474) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %20965 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %20966 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %20967 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %20968 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %20969 = "torch.prim.ListConstruct"(%20965, %18477, %20966, %20967, %20968) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %20970 = "torch.aten.view"(%20959, %20969) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%20970, %18474) <{shape_expressions = #map22}> : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %20971 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %20972 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %20973 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %20974 = "torch.prim.ListConstruct"(%19011, %20971, %20972, %20973) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %20975 = "torch.aten.view"(%20970, %20974) : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%20975, %18474) <{shape_expressions = #map23}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %20976 = "torch.prim.ListConstruct"(%19011) : (!torch.int) -> !torch.list<int>
    %20977 = "torch.aten.view"(%20964, %20976) : (!torch.vtensor<[4,?],si64>, !torch.list<int>) -> !torch.vtensor<[?],si64>
    "torch.bind_symbolic_shape"(%20977, %18474) <{shape_expressions = #map24}> : (!torch.vtensor<[?],si64>, !torch.int) -> ()
    %20978 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %20979 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %20980 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %20981 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %20982 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %20983 = "torch.prim.ListConstruct"(%18479, %20978, %20979, %20980, %20981, %20982) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %20984 = "torch.aten.view"(%20386, %20983) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%20984, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %20985 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %20986 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %20987 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %20988 = "torch.prim.ListConstruct"(%18993, %20985, %20986, %20987) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %20989 = "torch.aten.view"(%20984, %20988) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%20989, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %20990 = "torch.prim.ListConstruct"(%20977) : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>>
    %20991 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %20992 = "torch.aten.index_put"(%20989, %20990, %20975, %20991) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.bool) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%20992, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %20993 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %20994 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %20995 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %20996 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %20997 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %20998 = "torch.prim.ListConstruct"(%18479, %20993, %20994, %20995, %20996, %20997) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %20999 = "torch.aten.view"(%20992, %20998) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%20999, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %21000 = "torch.constant.int"() <{value = 2097152 : i64}> : () -> !torch.int
    %21001 = "torch.prim.ListConstruct"(%18479, %21000) : (!torch.int, !torch.int) -> !torch.list<int>
    %21002 = "torch.aten.view"(%20999, %21001) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,2097152],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%21002, %18475) <{shape_expressions = #map2}> : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> ()
    %21003 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %21004 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %21005 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %21006 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %21007 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %21008 = "torch.prim.ListConstruct"(%18479, %21003, %21004, %21005, %21006, %21007) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %21009 = "torch.aten.view"(%21002, %21008) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%21009, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %21010 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %21011 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %21012 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %21013 = "torch.prim.ListConstruct"(%18993, %21010, %21011, %21012) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %21014 = "torch.aten.view"(%21009, %21013) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%21014, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %21015 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %21016 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %21017 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %21018 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %21019 = "torch.prim.ListConstruct"(%21015, %18477, %21016, %21017, %21018) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %21020 = "torch.aten.view"(%20659, %21019) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%21020, %18474) <{shape_expressions = #map22}> : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %21021 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %21022 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %21023 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %21024 = "torch.prim.ListConstruct"(%19011, %21021, %21022, %21023) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %21025 = "torch.aten.view"(%21020, %21024) : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%21025, %18474) <{shape_expressions = #map23}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %21026 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %21027 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %21028 = "torch.aten.add.Scalar"(%20964, %21026, %21027) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%21028, %18474) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %21029 = "torch.prim.ListConstruct"(%19011) : (!torch.int) -> !torch.list<int>
    %21030 = "torch.aten.view"(%21028, %21029) : (!torch.vtensor<[4,?],si64>, !torch.list<int>) -> !torch.vtensor<[?],si64>
    "torch.bind_symbolic_shape"(%21030, %18474) <{shape_expressions = #map24}> : (!torch.vtensor<[?],si64>, !torch.int) -> ()
    %21031 = "torch.prim.ListConstruct"(%21030) : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>>
    %21032 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %21033 = "torch.aten.index_put"(%21014, %21031, %21025, %21032) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.bool) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%21033, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %21034 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %21035 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %21036 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %21037 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %21038 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %21039 = "torch.prim.ListConstruct"(%18479, %21034, %21035, %21036, %21037, %21038) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %21040 = "torch.aten.view"(%21033, %21039) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%21040, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %21041 = "torch.constant.int"() <{value = 2097152 : i64}> : () -> !torch.int
    %21042 = "torch.prim.ListConstruct"(%18479, %21041) : (!torch.int, !torch.int) -> !torch.list<int>
    %21043 = "torch.aten.view"(%21040, %21042) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,2097152],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%21043, %18475) <{shape_expressions = #map2}> : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> ()
    %21044 = "torch.constant.int"() <{value = -2 : i64}> : () -> !torch.int
    %21045 = "torch.aten.unsqueeze"(%20959, %21044) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%21045, %18474) <{shape_expressions = #map25}> : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.int) -> ()
    %21046 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %21047 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %21048 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %21049 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %21050 = "torch.prim.ListConstruct"(%21046, %18481, %21047, %21048, %21049) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %21051 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %21052 = "torch.aten.expand"(%21045, %21050, %21051) : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%21052, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %21053 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %21054 = "torch.aten.clone"(%21052, %21053) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%21054, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %21055 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %21056 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %21057 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %21058 = "torch.prim.ListConstruct"(%21055, %18481, %21056, %21057) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %21059 = "torch.aten._unsafe_view"(%21054, %21058) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%21059, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %21060 = "torch.constant.int"() <{value = -2 : i64}> : () -> !torch.int
    %21061 = "torch.aten.unsqueeze"(%20659, %21060) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%21061, %18474) <{shape_expressions = #map25}> : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.int) -> ()
    %21062 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %21063 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %21064 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %21065 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %21066 = "torch.prim.ListConstruct"(%21062, %18481, %21063, %21064, %21065) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %21067 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %21068 = "torch.aten.expand"(%21061, %21066, %21067) : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%21068, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %21069 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %21070 = "torch.aten.clone"(%21068, %21069) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%21070, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %21071 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %21072 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %21073 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %21074 = "torch.prim.ListConstruct"(%21071, %18481, %21072, %21073) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %21075 = "torch.aten._unsafe_view"(%21070, %21074) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%21075, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %21076 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %21077 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %21078 = "torch.aten.transpose.int"(%20809, %21076, %21077) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%21078, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %21079 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %21080 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %21081 = "torch.aten.transpose.int"(%21059, %21079, %21080) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%21081, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %21082 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %21083 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %21084 = "torch.aten.transpose.int"(%21075, %21082, %21083) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%21084, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %21085 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %21086 = "torch.aten.squeeze.dim"(%18570, %21085) : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,1,?,?],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%21086, %18474) <{shape_expressions = #map8}> : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> ()
    %21087 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %21088 = "torch.aten.squeeze.dim"(%21086, %21087) : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,1,?,?],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%21088, %18474) <{shape_expressions = #map8}> : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> ()
    %21089 = "torch_c.to_builtin_tensor"(%21078) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %21090 = "torch_c.to_builtin_tensor"(%21081) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %21091 = "torch_c.to_builtin_tensor"(%21084) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %21092 = "torch_c.to_builtin_tensor"(%21088) : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>) -> tensor<4x1x?x?xf8E4M3FNUZ>
    %21093 = "tensor.cast"(%21092) : (tensor<4x1x?x?xf8E4M3FNUZ>) -> tensor<?x?x?x?xf8E4M3FNUZ>
    %21094 = "torch_c.to_builtin_tensor"(%17329) : (!torch.vtensor<[],f32>) -> tensor<f32>
    %21095 = "util.call"(%21089, %21090, %21091, %21094, %21093) <{callee = @sharktank_masked_flash_attention_4_32_128_128_f8E4M3FNUZ_f32_f32}> : (tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<f32>, tensor<?x?x?x?xf8E4M3FNUZ>) -> tensor<4x32x?x128xf32>
    %21096 = "torch_c.from_builtin_tensor"(%21095) : (tensor<4x32x?x128xf32>) -> !torch.vtensor<[4,32,?,128],f32>
    "torch.bind_symbolic_shape"(%21096, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f32>, !torch.int) -> ()
    %21097 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %21098 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %21099 = "torch.aten.transpose.int"(%21096, %21097, %21098) : (!torch.vtensor<[4,32,?,128],f32>, !torch.int, !torch.int) -> !torch.vtensor<[4,?,32,128],f32>
    "torch.bind_symbolic_shape"(%21099, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f32>, !torch.int) -> ()
    %21100 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %21101 = "torch.aten.clone"(%21099, %21100) : (!torch.vtensor<[4,?,32,128],f32>, !torch.int) -> !torch.vtensor<[4,?,32,128],f32>
    "torch.bind_symbolic_shape"(%21101, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f32>, !torch.int) -> ()
    %21102 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %21103 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %21104 = "torch.prim.ListConstruct"(%21102, %18481, %21103) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %21105 = "torch.aten._unsafe_view"(%21101, %21104) : (!torch.vtensor<[4,?,32,128],f32>, !torch.list<int>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%21105, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %21106 = "torch.aten.div.Tensor"(%21105, %17331) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%21106, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %21107 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %21108 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %21109 = "torch.aten.clamp"(%21106, %21107, %21108) : (!torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%21109, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %21110 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %21111 = "torch.prims.convert_element_type"(%21109, %21110) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%21111, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %21112 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %21113 = "torch.aten.unsqueeze"(%17333, %21112) : (!torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>
    %21114 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %21115 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %21116 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %21117 = "torch.prim.ListConstruct"(%21114, %21115, %21116) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %21118 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %21119 = "torch.aten.expand"(%21113, %21117, %21118) : (!torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ>
    %21120 = "torch_c.to_builtin_tensor"(%21111) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %21121 = "torch_c.to_builtin_tensor"(%21119) : (!torch.vtensor<[4,4096,4096],f8E4M3FNUZ>) -> tensor<4x4096x4096xf8E4M3FNUZ>
    %21122 = "util.call"(%21120, %21121) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x?x4096xf32>
    %21123 = "torch_c.from_builtin_tensor"(%21122) : (tensor<4x?x4096xf32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%21123, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %21124 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %21125 = "torch.prims.convert_element_type"(%21123, %21124) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%21125, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %21126 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %21127 = "torch.aten.add.Tensor"(%20553, %21125, %21126) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%21127, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %21128 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %21129 = "torch.prims.convert_element_type"(%21127, %21128) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%21129, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %21130 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %21131 = "torch.aten.pow.Tensor_Scalar"(%21129, %21130) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%21131, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %21132 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %21133 = "torch.prim.ListConstruct"(%21132) : (!torch.int) -> !torch.list<int>
    %21134 = "torch.constant.bool"() <{value = true}> : () -> !torch.bool
    %21135 = "torch.constant.none"() : () -> !torch.none
    %21136 = "torch.aten.mean.dim"(%21131, %21133, %21134, %21135) : (!torch.vtensor<[4,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%21136, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %21137 = "torch.constant.float"() <{value = 1.000000e-05 : f64}> : () -> !torch.float
    %21138 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %21139 = "torch.aten.add.Scalar"(%21136, %21137, %21138) : (!torch.vtensor<[4,?,1],f32>, !torch.float, !torch.int) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%21139, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %21140 = "torch.aten.rsqrt"(%21139) : (!torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%21140, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %21141 = "torch.aten.mul.Tensor"(%21129, %21140) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%21141, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %21142 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %21143 = "torch.prims.convert_element_type"(%21141, %21142) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%21143, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %21144 = "torch.aten.mul.Tensor"(%17335, %21143) : (!torch.vtensor<[4096],bf16>, !torch.vtensor<[4,?,4096],bf16>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%21144, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %21145 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %21146 = "torch.prims.convert_element_type"(%21144, %21145) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%21146, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %21147 = "torch.aten.div.Tensor"(%21146, %17337) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%21147, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %21148 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %21149 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %21150 = "torch.aten.clamp"(%21147, %21148, %21149) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%21150, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %21151 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %21152 = "torch.prims.convert_element_type"(%21150, %21151) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%21152, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %21153 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %21154 = "torch.aten.unsqueeze"(%17339, %21153) : (!torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>
    %21155 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %21156 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %21157 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %21158 = "torch.prim.ListConstruct"(%21155, %21156, %21157) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %21159 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %21160 = "torch.aten.expand"(%21154, %21158, %21159) : (!torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ>
    %21161 = "torch_c.to_builtin_tensor"(%21152) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %21162 = "torch_c.to_builtin_tensor"(%21160) : (!torch.vtensor<[4,14336,4096],f8E4M3FNUZ>) -> tensor<4x14336x4096xf8E4M3FNUZ>
    %21163 = "util.call"(%21161, %21162) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x?x14336xf32>
    %21164 = "torch_c.from_builtin_tensor"(%21163) : (tensor<4x?x14336xf32>) -> !torch.vtensor<[4,?,14336],f32>
    "torch.bind_symbolic_shape"(%21164, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> ()
    %21165 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %21166 = "torch.prims.convert_element_type"(%21164, %21165) : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%21166, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %21167 = "torch.aten.silu"(%21166) : (!torch.vtensor<[4,?,14336],bf16>) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%21167, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %21168 = "torch.aten.div.Tensor"(%21146, %17341) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%21168, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %21169 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %21170 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %21171 = "torch.aten.clamp"(%21168, %21169, %21170) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%21171, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %21172 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %21173 = "torch.prims.convert_element_type"(%21171, %21172) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%21173, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %21174 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %21175 = "torch.aten.unsqueeze"(%17343, %21174) : (!torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>
    %21176 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %21177 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %21178 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %21179 = "torch.prim.ListConstruct"(%21176, %21177, %21178) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %21180 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %21181 = "torch.aten.expand"(%21175, %21179, %21180) : (!torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ>
    %21182 = "torch_c.to_builtin_tensor"(%21173) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %21183 = "torch_c.to_builtin_tensor"(%21181) : (!torch.vtensor<[4,14336,4096],f8E4M3FNUZ>) -> tensor<4x14336x4096xf8E4M3FNUZ>
    %21184 = "util.call"(%21182, %21183) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x?x14336xf32>
    %21185 = "torch_c.from_builtin_tensor"(%21184) : (tensor<4x?x14336xf32>) -> !torch.vtensor<[4,?,14336],f32>
    "torch.bind_symbolic_shape"(%21185, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> ()
    %21186 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %21187 = "torch.prims.convert_element_type"(%21185, %21186) : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%21187, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %21188 = "torch.aten.mul.Tensor"(%21167, %21187) : (!torch.vtensor<[4,?,14336],bf16>, !torch.vtensor<[4,?,14336],bf16>) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%21188, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %21189 = "torch.aten.div.Tensor"(%21188, %17345) : (!torch.vtensor<[4,?,14336],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%21189, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %21190 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %21191 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %21192 = "torch.aten.clamp"(%21189, %21190, %21191) : (!torch.vtensor<[4,?,14336],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%21192, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %21193 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %21194 = "torch.prims.convert_element_type"(%21192, %21193) : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> !torch.vtensor<[4,?,14336],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%21194, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],f8E4M3FNUZ>, !torch.int) -> ()
    %21195 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %21196 = "torch.aten.unsqueeze"(%17347, %21195) : (!torch.vtensor<[4096,14336],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,14336],f8E4M3FNUZ>
    %21197 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %21198 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %21199 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %21200 = "torch.prim.ListConstruct"(%21197, %21198, %21199) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %21201 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %21202 = "torch.aten.expand"(%21196, %21200, %21201) : (!torch.vtensor<[1,4096,14336],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,14336],f8E4M3FNUZ>
    %21203 = "torch_c.to_builtin_tensor"(%21194) : (!torch.vtensor<[4,?,14336],f8E4M3FNUZ>) -> tensor<4x?x14336xf8E4M3FNUZ>
    %21204 = "torch_c.to_builtin_tensor"(%21202) : (!torch.vtensor<[4,4096,14336],f8E4M3FNUZ>) -> tensor<4x4096x14336xf8E4M3FNUZ>
    %21205 = "util.call"(%21203, %21204) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx14336xf8E4M3FNUZ_R4x4096x14336xf8E4M3FNUZ}> : (tensor<4x?x14336xf8E4M3FNUZ>, tensor<4x4096x14336xf8E4M3FNUZ>) -> tensor<4x?x4096xf32>
    %21206 = "torch_c.from_builtin_tensor"(%21205) : (tensor<4x?x4096xf32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%21206, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %21207 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %21208 = "torch.prims.convert_element_type"(%21206, %21207) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%21208, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %21209 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %21210 = "torch.aten.add.Tensor"(%21127, %21208, %21209) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%21210, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %21211 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %21212 = "torch.prims.convert_element_type"(%21210, %21211) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%21212, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %21213 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %21214 = "torch.aten.pow.Tensor_Scalar"(%21212, %21213) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%21214, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %21215 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %21216 = "torch.prim.ListConstruct"(%21215) : (!torch.int) -> !torch.list<int>
    %21217 = "torch.constant.bool"() <{value = true}> : () -> !torch.bool
    %21218 = "torch.constant.none"() : () -> !torch.none
    %21219 = "torch.aten.mean.dim"(%21214, %21216, %21217, %21218) : (!torch.vtensor<[4,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%21219, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %21220 = "torch.constant.float"() <{value = 1.000000e-05 : f64}> : () -> !torch.float
    %21221 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %21222 = "torch.aten.add.Scalar"(%21219, %21220, %21221) : (!torch.vtensor<[4,?,1],f32>, !torch.float, !torch.int) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%21222, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %21223 = "torch.aten.rsqrt"(%21222) : (!torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%21223, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %21224 = "torch.aten.mul.Tensor"(%21212, %21223) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%21224, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %21225 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %21226 = "torch.prims.convert_element_type"(%21224, %21225) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%21226, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %21227 = "torch.aten.mul.Tensor"(%17349, %21226) : (!torch.vtensor<[4096],bf16>, !torch.vtensor<[4,?,4096],bf16>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%21227, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %21228 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %21229 = "torch.prims.convert_element_type"(%21227, %21228) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%21229, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %21230 = "torch.aten.div.Tensor"(%21229, %17351) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%21230, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %21231 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %21232 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %21233 = "torch.aten.clamp"(%21230, %21231, %21232) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%21233, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %21234 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %21235 = "torch.prims.convert_element_type"(%21233, %21234) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%21235, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %21236 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %21237 = "torch.aten.unsqueeze"(%17353, %21236) : (!torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>
    %21238 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %21239 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %21240 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %21241 = "torch.prim.ListConstruct"(%21238, %21239, %21240) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %21242 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %21243 = "torch.aten.expand"(%21237, %21241, %21242) : (!torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ>
    %21244 = "torch_c.to_builtin_tensor"(%21235) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %21245 = "torch_c.to_builtin_tensor"(%21243) : (!torch.vtensor<[4,4096,4096],f8E4M3FNUZ>) -> tensor<4x4096x4096xf8E4M3FNUZ>
    %21246 = "util.call"(%21244, %21245) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x?x4096xf32>
    %21247 = "torch_c.from_builtin_tensor"(%21246) : (tensor<4x?x4096xf32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%21247, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %21248 = "torch.aten.div.Tensor"(%21247, %17355) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%21248, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %21249 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %21250 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %21251 = "torch.aten.clamp"(%21248, %21249, %21250) : (!torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%21251, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %21252 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %21253 = "torch.prims.convert_element_type"(%21251, %21252) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%21253, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %21254 = "torch.aten.div.Tensor"(%21229, %17357) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%21254, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %21255 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %21256 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %21257 = "torch.aten.clamp"(%21254, %21255, %21256) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%21257, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %21258 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %21259 = "torch.prims.convert_element_type"(%21257, %21258) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%21259, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %21260 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %21261 = "torch.aten.unsqueeze"(%17359, %21260) : (!torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>
    %21262 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %21263 = "torch.constant.int"() <{value = 1024 : i64}> : () -> !torch.int
    %21264 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %21265 = "torch.prim.ListConstruct"(%21262, %21263, %21264) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %21266 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %21267 = "torch.aten.expand"(%21261, %21265, %21266) : (!torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ>
    %21268 = "torch_c.to_builtin_tensor"(%21259) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %21269 = "torch_c.to_builtin_tensor"(%21267) : (!torch.vtensor<[4,1024,4096],f8E4M3FNUZ>) -> tensor<4x1024x4096xf8E4M3FNUZ>
    %21270 = "util.call"(%21268, %21269) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x?x1024xf32>
    %21271 = "torch_c.from_builtin_tensor"(%21270) : (tensor<4x?x1024xf32>) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%21271, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %21272 = "torch.aten.div.Tensor"(%21271, %17361) : (!torch.vtensor<[4,?,1024],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%21272, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %21273 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %21274 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %21275 = "torch.aten.clamp"(%21272, %21273, %21274) : (!torch.vtensor<[4,?,1024],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%21275, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %21276 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %21277 = "torch.prims.convert_element_type"(%21275, %21276) : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> !torch.vtensor<[4,?,1024],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%21277, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.int) -> ()
    %21278 = "torch.aten.div.Tensor"(%21229, %17363) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%21278, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %21279 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %21280 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %21281 = "torch.aten.clamp"(%21278, %21279, %21280) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%21281, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %21282 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %21283 = "torch.prims.convert_element_type"(%21281, %21282) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%21283, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %21284 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %21285 = "torch.aten.unsqueeze"(%17365, %21284) : (!torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>
    %21286 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %21287 = "torch.constant.int"() <{value = 1024 : i64}> : () -> !torch.int
    %21288 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %21289 = "torch.prim.ListConstruct"(%21286, %21287, %21288) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %21290 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %21291 = "torch.aten.expand"(%21285, %21289, %21290) : (!torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ>
    %21292 = "torch_c.to_builtin_tensor"(%21283) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %21293 = "torch_c.to_builtin_tensor"(%21291) : (!torch.vtensor<[4,1024,4096],f8E4M3FNUZ>) -> tensor<4x1024x4096xf8E4M3FNUZ>
    %21294 = "util.call"(%21292, %21293) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x?x1024xf32>
    %21295 = "torch_c.from_builtin_tensor"(%21294) : (tensor<4x?x1024xf32>) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%21295, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %21296 = "torch.aten.div.Tensor"(%21295, %17367) : (!torch.vtensor<[4,?,1024],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%21296, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %21297 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %21298 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %21299 = "torch.aten.clamp"(%21296, %21297, %21298) : (!torch.vtensor<[4,?,1024],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%21299, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %21300 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %21301 = "torch.prims.convert_element_type"(%21299, %21300) : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> !torch.vtensor<[4,?,1024],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%21301, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.int) -> ()
    %21302 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %21303 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %21304 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %21305 = "torch.prim.ListConstruct"(%21302, %18481, %21303, %21304) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %21306 = "torch.aten.view"(%21253, %21305) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%21306, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %21307 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %21308 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %21309 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %21310 = "torch.prim.ListConstruct"(%21307, %18481, %21308, %21309) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %21311 = "torch.aten.view"(%21277, %21310) : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%21311, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %21312 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %21313 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %21314 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %21315 = "torch.prim.ListConstruct"(%21312, %18481, %21313, %21314) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %21316 = "torch.aten.view"(%21301, %21315) : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%21316, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %21317 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
    %21318 = "torch.constant.none"() : () -> !torch.none
    %21319 = "torch.constant.none"() : () -> !torch.none
    %21320 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %21321 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %21322 = "torch.aten.arange"(%21317, %21318, %21319, %21320, %21321) : (!torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[131072],si64>
    %21323 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %21324 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %21325 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %21326 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %21327 = "torch.constant.none"() : () -> !torch.none
    %21328 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %21329 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %21330 = "torch.aten.arange.start_step"(%21323, %21324, %21325, %21326, %21327, %21328, %21329) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[64],si64>
    %21331 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %21332 = "torch.prims.convert_element_type"(%21330, %21331) : (!torch.vtensor<[64],si64>, !torch.int) -> !torch.vtensor<[64],f32>
    %21333 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %21334 = "torch.aten.div.Scalar"(%21332, %21333) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %21335 = "torch.constant.float"() <{value = 5.000000e+05 : f64}> : () -> !torch.float
    %21336 = "torch.aten.pow.Scalar"(%21335, %21334) : (!torch.float, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %21337 = "torch.aten.reciprocal"(%21336) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %21338 = "torch.constant.float"() <{value = 1.000000e+00 : f64}> : () -> !torch.float
    %21339 = "torch.aten.mul.Scalar"(%21337, %21338) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
    %21340 = "torch.aten.reciprocal"(%21339) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %21341 = "torch.constant.float"() <{value = 6.2831853071795862 : f64}> : () -> !torch.float
    %21342 = "torch.aten.mul.Scalar"(%21340, %21341) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
    %21343 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
    %21344 = "torch.aten.gt.Scalar"(%21342, %21343) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %21345 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %21346 = "torch.aten.div.Scalar"(%21339, %21345) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %21347 = "torch.aten.where.self"(%21344, %21346, %21339) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %21348 = "torch.aten.reciprocal"(%21342) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %21349 = "torch.constant.int"() <{value = 8192 : i64}> : () -> !torch.int
    %21350 = "torch.aten.mul.Scalar"(%21348, %21349) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %21351 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %21352 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %21353 = "torch.aten.sub.Scalar"(%21350, %21351, %21352) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
    %21354 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %21355 = "torch.aten.div.Scalar"(%21353, %21354) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %21356 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %21357 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %21358 = "torch.aten.rsub.Scalar"(%21355, %21356, %21357) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
    %21359 = "torch.aten.mul.Tensor"(%21358, %21347) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %21360 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %21361 = "torch.aten.div.Scalar"(%21359, %21360) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %21362 = "torch.aten.mul.Tensor"(%21355, %21347) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %21363 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %21364 = "torch.aten.add.Tensor"(%21361, %21362, %21363) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %21365 = "torch.constant.float"() <{value = 2.048000e+03 : f64}> : () -> !torch.float
    %21366 = "torch.aten.lt.Scalar"(%21342, %21365) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %21367 = "torch.aten.bitwise_not"(%21366) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %21368 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
    %21369 = "torch.aten.gt.Scalar"(%21342, %21368) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %21370 = "torch.aten.bitwise_not"(%21369) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %21371 = "torch.aten.mul.Tensor"(%21367, %21370) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %21372 = "torch.aten.where.self"(%21371, %21364, %21347) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %21373 = "torch.prim.ListConstruct"(%21372, %21372) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.list<vtensor>
    %21374 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %21375 = "torch.aten.cat"(%21373, %21374) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[128],f32>
    %21376 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %21377 = "torch.prims.convert_element_type"(%21322, %21376) : (!torch.vtensor<[131072],si64>, !torch.int) -> !torch.vtensor<[131072],f32>
    %21378 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %21379 = "torch.prims.convert_element_type"(%21375, %21378) : (!torch.vtensor<[128],f32>, !torch.int) -> !torch.vtensor<[128],f32>
    %21380 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
    %21381 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %21382 = "torch.prim.ListConstruct"(%21380, %21381) : (!torch.int, !torch.int) -> !torch.list<int>
    %21383 = "torch.aten.view"(%21377, %21382) : (!torch.vtensor<[131072],f32>, !torch.list<int>) -> !torch.vtensor<[131072,1],f32>
    %21384 = "torch.aten.mul.Tensor"(%21383, %21379) : (!torch.vtensor<[131072,1],f32>, !torch.vtensor<[128],f32>) -> !torch.vtensor<[131072,128],f32>
    %21385 = "torch.aten.cos"(%21384) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
    %21386 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %21387 = "torch.prims.convert_element_type"(%21385, %21386) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
    %21388 = "torch.aten.sin"(%21384) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
    %21389 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %21390 = "torch.prims.convert_element_type"(%21388, %21389) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
    %21391 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %21392 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %21393 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %21394 = "torch.aten.slice.Tensor"(%21387, %21391, %21392, %18481, %21393) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%21394, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %21395 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %21396 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %21397 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %21398 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %21399 = "torch.aten.slice.Tensor"(%21394, %21395, %21396, %21397, %21398) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%21399, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %21400 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %21401 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %21402 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %21403 = "torch.aten.slice.Tensor"(%21390, %21400, %21401, %18481, %21402) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%21403, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %21404 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %21405 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %21406 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %21407 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %21408 = "torch.aten.slice.Tensor"(%21403, %21404, %21405, %21406, %21407) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%21408, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %21409 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %21410 = "torch.aten.unsqueeze"(%21399, %21409) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%21410, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %21411 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %21412 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %21413 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %21414 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %21415 = "torch.aten.slice.Tensor"(%21410, %21411, %21412, %21413, %21414) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%21415, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %21416 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %21417 = "torch.aten.unsqueeze"(%21415, %21416) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%21417, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %21418 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %21419 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %21420 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %21421 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %21422 = "torch.aten.slice.Tensor"(%21417, %21418, %21419, %21420, %21421) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%21422, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %21423 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %21424 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %21425 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %21426 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %21427 = "torch.prim.ListConstruct"(%21423, %21424, %21425, %21426) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %21428 = "torch.aten.repeat"(%21422, %21427) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%21428, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
    %21429 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %21430 = "torch.aten.unsqueeze"(%21408, %21429) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%21430, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %21431 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %21432 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %21433 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %21434 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %21435 = "torch.aten.slice.Tensor"(%21430, %21431, %21432, %21433, %21434) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%21435, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %21436 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %21437 = "torch.aten.unsqueeze"(%21435, %21436) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%21437, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %21438 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %21439 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %21440 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %21441 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %21442 = "torch.aten.slice.Tensor"(%21437, %21438, %21439, %21440, %21441) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%21442, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %21443 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %21444 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %21445 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %21446 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %21447 = "torch.prim.ListConstruct"(%21443, %21444, %21445, %21446) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %21448 = "torch.aten.repeat"(%21442, %21447) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%21448, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
    %21449 = "torch.aten.mul.Tensor"(%21306, %21428) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%21449, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %21450 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %21451 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %21452 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %21453 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %21454 = "torch.aten.slice.Tensor"(%21306, %21450, %21451, %21452, %21453) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%21454, %18474) <{shape_expressions = #map18}> : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.int) -> ()
    %21455 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %21456 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %21457 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %21458 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %21459 = "torch.aten.slice.Tensor"(%21306, %21455, %21456, %21457, %21458) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%21459, %18474) <{shape_expressions = #map18}> : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.int) -> ()
    %21460 = "torch.aten.neg"(%21459) : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>) -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%21460, %18474) <{shape_expressions = #map18}> : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.int) -> ()
    %21461 = "torch.prim.ListConstruct"(%21460, %21454) : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>) -> !torch.list<vtensor>
    %21462 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %21463 = "torch.aten.cat"(%21461, %21462) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%21463, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %21464 = "torch.aten.mul.Tensor"(%21463, %21448) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%21464, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %21465 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %21466 = "torch.aten.add.Tensor"(%21449, %21464, %21465) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%21466, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %21467 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
    %21468 = "torch.constant.none"() : () -> !torch.none
    %21469 = "torch.constant.none"() : () -> !torch.none
    %21470 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %21471 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %21472 = "torch.aten.arange"(%21467, %21468, %21469, %21470, %21471) : (!torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[131072],si64>
    %21473 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %21474 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %21475 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %21476 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %21477 = "torch.constant.none"() : () -> !torch.none
    %21478 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %21479 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %21480 = "torch.aten.arange.start_step"(%21473, %21474, %21475, %21476, %21477, %21478, %21479) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[64],si64>
    %21481 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %21482 = "torch.prims.convert_element_type"(%21480, %21481) : (!torch.vtensor<[64],si64>, !torch.int) -> !torch.vtensor<[64],f32>
    %21483 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %21484 = "torch.aten.div.Scalar"(%21482, %21483) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %21485 = "torch.constant.float"() <{value = 5.000000e+05 : f64}> : () -> !torch.float
    %21486 = "torch.aten.pow.Scalar"(%21485, %21484) : (!torch.float, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %21487 = "torch.aten.reciprocal"(%21486) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %21488 = "torch.constant.float"() <{value = 1.000000e+00 : f64}> : () -> !torch.float
    %21489 = "torch.aten.mul.Scalar"(%21487, %21488) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
    %21490 = "torch.aten.reciprocal"(%21489) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %21491 = "torch.constant.float"() <{value = 6.2831853071795862 : f64}> : () -> !torch.float
    %21492 = "torch.aten.mul.Scalar"(%21490, %21491) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
    %21493 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
    %21494 = "torch.aten.gt.Scalar"(%21492, %21493) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %21495 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %21496 = "torch.aten.div.Scalar"(%21489, %21495) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %21497 = "torch.aten.where.self"(%21494, %21496, %21489) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %21498 = "torch.aten.reciprocal"(%21492) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %21499 = "torch.constant.int"() <{value = 8192 : i64}> : () -> !torch.int
    %21500 = "torch.aten.mul.Scalar"(%21498, %21499) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %21501 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %21502 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %21503 = "torch.aten.sub.Scalar"(%21500, %21501, %21502) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
    %21504 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %21505 = "torch.aten.div.Scalar"(%21503, %21504) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %21506 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %21507 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %21508 = "torch.aten.rsub.Scalar"(%21505, %21506, %21507) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
    %21509 = "torch.aten.mul.Tensor"(%21508, %21497) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %21510 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %21511 = "torch.aten.div.Scalar"(%21509, %21510) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %21512 = "torch.aten.mul.Tensor"(%21505, %21497) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %21513 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %21514 = "torch.aten.add.Tensor"(%21511, %21512, %21513) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %21515 = "torch.constant.float"() <{value = 2.048000e+03 : f64}> : () -> !torch.float
    %21516 = "torch.aten.lt.Scalar"(%21492, %21515) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %21517 = "torch.aten.bitwise_not"(%21516) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %21518 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
    %21519 = "torch.aten.gt.Scalar"(%21492, %21518) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %21520 = "torch.aten.bitwise_not"(%21519) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %21521 = "torch.aten.mul.Tensor"(%21517, %21520) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %21522 = "torch.aten.where.self"(%21521, %21514, %21497) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %21523 = "torch.prim.ListConstruct"(%21522, %21522) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.list<vtensor>
    %21524 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %21525 = "torch.aten.cat"(%21523, %21524) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[128],f32>
    %21526 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %21527 = "torch.prims.convert_element_type"(%21472, %21526) : (!torch.vtensor<[131072],si64>, !torch.int) -> !torch.vtensor<[131072],f32>
    %21528 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %21529 = "torch.prims.convert_element_type"(%21525, %21528) : (!torch.vtensor<[128],f32>, !torch.int) -> !torch.vtensor<[128],f32>
    %21530 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
    %21531 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %21532 = "torch.prim.ListConstruct"(%21530, %21531) : (!torch.int, !torch.int) -> !torch.list<int>
    %21533 = "torch.aten.view"(%21527, %21532) : (!torch.vtensor<[131072],f32>, !torch.list<int>) -> !torch.vtensor<[131072,1],f32>
    %21534 = "torch.aten.mul.Tensor"(%21533, %21529) : (!torch.vtensor<[131072,1],f32>, !torch.vtensor<[128],f32>) -> !torch.vtensor<[131072,128],f32>
    %21535 = "torch.aten.cos"(%21534) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
    %21536 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %21537 = "torch.prims.convert_element_type"(%21535, %21536) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
    %21538 = "torch.aten.sin"(%21534) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
    %21539 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %21540 = "torch.prims.convert_element_type"(%21538, %21539) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
    %21541 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %21542 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %21543 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %21544 = "torch.aten.slice.Tensor"(%21537, %21541, %21542, %18481, %21543) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%21544, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %21545 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %21546 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %21547 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %21548 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %21549 = "torch.aten.slice.Tensor"(%21544, %21545, %21546, %21547, %21548) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%21549, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %21550 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %21551 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %21552 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %21553 = "torch.aten.slice.Tensor"(%21540, %21550, %21551, %18481, %21552) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%21553, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %21554 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %21555 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %21556 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %21557 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %21558 = "torch.aten.slice.Tensor"(%21553, %21554, %21555, %21556, %21557) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%21558, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %21559 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %21560 = "torch.aten.unsqueeze"(%21549, %21559) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%21560, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %21561 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %21562 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %21563 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %21564 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %21565 = "torch.aten.slice.Tensor"(%21560, %21561, %21562, %21563, %21564) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%21565, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %21566 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %21567 = "torch.aten.unsqueeze"(%21565, %21566) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%21567, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %21568 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %21569 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %21570 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %21571 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %21572 = "torch.aten.slice.Tensor"(%21567, %21568, %21569, %21570, %21571) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%21572, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %21573 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %21574 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %21575 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %21576 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %21577 = "torch.prim.ListConstruct"(%21573, %21574, %21575, %21576) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %21578 = "torch.aten.repeat"(%21572, %21577) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%21578, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
    %21579 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %21580 = "torch.aten.unsqueeze"(%21558, %21579) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%21580, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %21581 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %21582 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %21583 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %21584 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %21585 = "torch.aten.slice.Tensor"(%21580, %21581, %21582, %21583, %21584) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%21585, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %21586 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %21587 = "torch.aten.unsqueeze"(%21585, %21586) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%21587, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %21588 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %21589 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %21590 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %21591 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %21592 = "torch.aten.slice.Tensor"(%21587, %21588, %21589, %21590, %21591) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%21592, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %21593 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %21594 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %21595 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %21596 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %21597 = "torch.prim.ListConstruct"(%21593, %21594, %21595, %21596) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %21598 = "torch.aten.repeat"(%21592, %21597) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%21598, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
    %21599 = "torch.aten.mul.Tensor"(%21311, %21578) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%21599, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %21600 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %21601 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %21602 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %21603 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %21604 = "torch.aten.slice.Tensor"(%21311, %21600, %21601, %21602, %21603) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%21604, %18474) <{shape_expressions = #map19}> : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.int) -> ()
    %21605 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %21606 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %21607 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %21608 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %21609 = "torch.aten.slice.Tensor"(%21311, %21605, %21606, %21607, %21608) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%21609, %18474) <{shape_expressions = #map19}> : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.int) -> ()
    %21610 = "torch.aten.neg"(%21609) : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>) -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%21610, %18474) <{shape_expressions = #map19}> : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.int) -> ()
    %21611 = "torch.prim.ListConstruct"(%21610, %21604) : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>) -> !torch.list<vtensor>
    %21612 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %21613 = "torch.aten.cat"(%21611, %21612) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%21613, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %21614 = "torch.aten.mul.Tensor"(%21613, %21598) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%21614, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %21615 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %21616 = "torch.aten.add.Tensor"(%21599, %21614, %21615) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%21616, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %21617 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %21618 = "torch.aten.mul.Scalar"(%arg69, %21617) : (!torch.vtensor<[4,?],si64>, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%21618, %18474) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %21619 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %21620 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %21621 = "torch.aten.add.Scalar"(%21618, %21619, %21620) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%21621, %18474) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %21622 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %21623 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %21624 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %21625 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %21626 = "torch.prim.ListConstruct"(%21622, %18477, %21623, %21624, %21625) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %21627 = "torch.aten.view"(%21616, %21626) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%21627, %18474) <{shape_expressions = #map22}> : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %21628 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %21629 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %21630 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %21631 = "torch.prim.ListConstruct"(%19011, %21628, %21629, %21630) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %21632 = "torch.aten.view"(%21627, %21631) : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%21632, %18474) <{shape_expressions = #map23}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %21633 = "torch.prim.ListConstruct"(%19011) : (!torch.int) -> !torch.list<int>
    %21634 = "torch.aten.view"(%21621, %21633) : (!torch.vtensor<[4,?],si64>, !torch.list<int>) -> !torch.vtensor<[?],si64>
    "torch.bind_symbolic_shape"(%21634, %18474) <{shape_expressions = #map24}> : (!torch.vtensor<[?],si64>, !torch.int) -> ()
    %21635 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %21636 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %21637 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %21638 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %21639 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %21640 = "torch.prim.ListConstruct"(%18479, %21635, %21636, %21637, %21638, %21639) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %21641 = "torch.aten.view"(%21043, %21640) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%21641, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %21642 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %21643 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %21644 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %21645 = "torch.prim.ListConstruct"(%18993, %21642, %21643, %21644) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %21646 = "torch.aten.view"(%21641, %21645) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%21646, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %21647 = "torch.prim.ListConstruct"(%21634) : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>>
    %21648 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %21649 = "torch.aten.index_put"(%21646, %21647, %21632, %21648) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.bool) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%21649, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %21650 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %21651 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %21652 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %21653 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %21654 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %21655 = "torch.prim.ListConstruct"(%18479, %21650, %21651, %21652, %21653, %21654) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %21656 = "torch.aten.view"(%21649, %21655) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%21656, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %21657 = "torch.constant.int"() <{value = 2097152 : i64}> : () -> !torch.int
    %21658 = "torch.prim.ListConstruct"(%18479, %21657) : (!torch.int, !torch.int) -> !torch.list<int>
    %21659 = "torch.aten.view"(%21656, %21658) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,2097152],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%21659, %18475) <{shape_expressions = #map2}> : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> ()
    %21660 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %21661 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %21662 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %21663 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %21664 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %21665 = "torch.prim.ListConstruct"(%18479, %21660, %21661, %21662, %21663, %21664) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %21666 = "torch.aten.view"(%21659, %21665) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%21666, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %21667 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %21668 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %21669 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %21670 = "torch.prim.ListConstruct"(%18993, %21667, %21668, %21669) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %21671 = "torch.aten.view"(%21666, %21670) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%21671, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %21672 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %21673 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %21674 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %21675 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %21676 = "torch.prim.ListConstruct"(%21672, %18477, %21673, %21674, %21675) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %21677 = "torch.aten.view"(%21316, %21676) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%21677, %18474) <{shape_expressions = #map22}> : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %21678 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %21679 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %21680 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %21681 = "torch.prim.ListConstruct"(%19011, %21678, %21679, %21680) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %21682 = "torch.aten.view"(%21677, %21681) : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%21682, %18474) <{shape_expressions = #map23}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %21683 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %21684 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %21685 = "torch.aten.add.Scalar"(%21621, %21683, %21684) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%21685, %18474) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %21686 = "torch.prim.ListConstruct"(%19011) : (!torch.int) -> !torch.list<int>
    %21687 = "torch.aten.view"(%21685, %21686) : (!torch.vtensor<[4,?],si64>, !torch.list<int>) -> !torch.vtensor<[?],si64>
    "torch.bind_symbolic_shape"(%21687, %18474) <{shape_expressions = #map24}> : (!torch.vtensor<[?],si64>, !torch.int) -> ()
    %21688 = "torch.prim.ListConstruct"(%21687) : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>>
    %21689 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %21690 = "torch.aten.index_put"(%21671, %21688, %21682, %21689) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.bool) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%21690, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %21691 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %21692 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %21693 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %21694 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %21695 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %21696 = "torch.prim.ListConstruct"(%18479, %21691, %21692, %21693, %21694, %21695) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %21697 = "torch.aten.view"(%21690, %21696) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%21697, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %21698 = "torch.constant.int"() <{value = 2097152 : i64}> : () -> !torch.int
    %21699 = "torch.prim.ListConstruct"(%18479, %21698) : (!torch.int, !torch.int) -> !torch.list<int>
    %21700 = "torch.aten.view"(%21697, %21699) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,2097152],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%21700, %18475) <{shape_expressions = #map2}> : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> ()
    %21701 = "torch.constant.int"() <{value = -2 : i64}> : () -> !torch.int
    %21702 = "torch.aten.unsqueeze"(%21616, %21701) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%21702, %18474) <{shape_expressions = #map25}> : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.int) -> ()
    %21703 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %21704 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %21705 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %21706 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %21707 = "torch.prim.ListConstruct"(%21703, %18481, %21704, %21705, %21706) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %21708 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %21709 = "torch.aten.expand"(%21702, %21707, %21708) : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%21709, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %21710 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %21711 = "torch.aten.clone"(%21709, %21710) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%21711, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %21712 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %21713 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %21714 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %21715 = "torch.prim.ListConstruct"(%21712, %18481, %21713, %21714) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %21716 = "torch.aten._unsafe_view"(%21711, %21715) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%21716, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %21717 = "torch.constant.int"() <{value = -2 : i64}> : () -> !torch.int
    %21718 = "torch.aten.unsqueeze"(%21316, %21717) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%21718, %18474) <{shape_expressions = #map25}> : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.int) -> ()
    %21719 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %21720 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %21721 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %21722 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %21723 = "torch.prim.ListConstruct"(%21719, %18481, %21720, %21721, %21722) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %21724 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %21725 = "torch.aten.expand"(%21718, %21723, %21724) : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%21725, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %21726 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %21727 = "torch.aten.clone"(%21725, %21726) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%21727, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %21728 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %21729 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %21730 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %21731 = "torch.prim.ListConstruct"(%21728, %18481, %21729, %21730) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %21732 = "torch.aten._unsafe_view"(%21727, %21731) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%21732, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %21733 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %21734 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %21735 = "torch.aten.transpose.int"(%21466, %21733, %21734) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%21735, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %21736 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %21737 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %21738 = "torch.aten.transpose.int"(%21716, %21736, %21737) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%21738, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %21739 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %21740 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %21741 = "torch.aten.transpose.int"(%21732, %21739, %21740) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%21741, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %21742 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %21743 = "torch.aten.squeeze.dim"(%18570, %21742) : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,1,?,?],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%21743, %18474) <{shape_expressions = #map8}> : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> ()
    %21744 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %21745 = "torch.aten.squeeze.dim"(%21743, %21744) : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,1,?,?],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%21745, %18474) <{shape_expressions = #map8}> : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> ()
    %21746 = "torch_c.to_builtin_tensor"(%21735) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %21747 = "torch_c.to_builtin_tensor"(%21738) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %21748 = "torch_c.to_builtin_tensor"(%21741) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %21749 = "torch_c.to_builtin_tensor"(%21745) : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>) -> tensor<4x1x?x?xf8E4M3FNUZ>
    %21750 = "tensor.cast"(%21749) : (tensor<4x1x?x?xf8E4M3FNUZ>) -> tensor<?x?x?x?xf8E4M3FNUZ>
    %21751 = "torch_c.to_builtin_tensor"(%17369) : (!torch.vtensor<[],f32>) -> tensor<f32>
    %21752 = "util.call"(%21746, %21747, %21748, %21751, %21750) <{callee = @sharktank_masked_flash_attention_4_32_128_128_f8E4M3FNUZ_f32_f32}> : (tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<f32>, tensor<?x?x?x?xf8E4M3FNUZ>) -> tensor<4x32x?x128xf32>
    %21753 = "torch_c.from_builtin_tensor"(%21752) : (tensor<4x32x?x128xf32>) -> !torch.vtensor<[4,32,?,128],f32>
    "torch.bind_symbolic_shape"(%21753, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f32>, !torch.int) -> ()
    %21754 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %21755 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %21756 = "torch.aten.transpose.int"(%21753, %21754, %21755) : (!torch.vtensor<[4,32,?,128],f32>, !torch.int, !torch.int) -> !torch.vtensor<[4,?,32,128],f32>
    "torch.bind_symbolic_shape"(%21756, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f32>, !torch.int) -> ()
    %21757 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %21758 = "torch.aten.clone"(%21756, %21757) : (!torch.vtensor<[4,?,32,128],f32>, !torch.int) -> !torch.vtensor<[4,?,32,128],f32>
    "torch.bind_symbolic_shape"(%21758, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f32>, !torch.int) -> ()
    %21759 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %21760 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %21761 = "torch.prim.ListConstruct"(%21759, %18481, %21760) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %21762 = "torch.aten._unsafe_view"(%21758, %21761) : (!torch.vtensor<[4,?,32,128],f32>, !torch.list<int>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%21762, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %21763 = "torch.aten.div.Tensor"(%21762, %17371) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%21763, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %21764 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %21765 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %21766 = "torch.aten.clamp"(%21763, %21764, %21765) : (!torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%21766, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %21767 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %21768 = "torch.prims.convert_element_type"(%21766, %21767) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%21768, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %21769 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %21770 = "torch.aten.unsqueeze"(%17373, %21769) : (!torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>
    %21771 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %21772 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %21773 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %21774 = "torch.prim.ListConstruct"(%21771, %21772, %21773) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %21775 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %21776 = "torch.aten.expand"(%21770, %21774, %21775) : (!torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ>
    %21777 = "torch_c.to_builtin_tensor"(%21768) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %21778 = "torch_c.to_builtin_tensor"(%21776) : (!torch.vtensor<[4,4096,4096],f8E4M3FNUZ>) -> tensor<4x4096x4096xf8E4M3FNUZ>
    %21779 = "util.call"(%21777, %21778) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x?x4096xf32>
    %21780 = "torch_c.from_builtin_tensor"(%21779) : (tensor<4x?x4096xf32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%21780, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %21781 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %21782 = "torch.prims.convert_element_type"(%21780, %21781) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%21782, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %21783 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %21784 = "torch.aten.add.Tensor"(%21210, %21782, %21783) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%21784, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %21785 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %21786 = "torch.prims.convert_element_type"(%21784, %21785) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%21786, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %21787 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %21788 = "torch.aten.pow.Tensor_Scalar"(%21786, %21787) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%21788, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %21789 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %21790 = "torch.prim.ListConstruct"(%21789) : (!torch.int) -> !torch.list<int>
    %21791 = "torch.constant.bool"() <{value = true}> : () -> !torch.bool
    %21792 = "torch.constant.none"() : () -> !torch.none
    %21793 = "torch.aten.mean.dim"(%21788, %21790, %21791, %21792) : (!torch.vtensor<[4,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%21793, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %21794 = "torch.constant.float"() <{value = 1.000000e-05 : f64}> : () -> !torch.float
    %21795 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %21796 = "torch.aten.add.Scalar"(%21793, %21794, %21795) : (!torch.vtensor<[4,?,1],f32>, !torch.float, !torch.int) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%21796, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %21797 = "torch.aten.rsqrt"(%21796) : (!torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%21797, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %21798 = "torch.aten.mul.Tensor"(%21786, %21797) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%21798, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %21799 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %21800 = "torch.prims.convert_element_type"(%21798, %21799) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%21800, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %21801 = "torch.aten.mul.Tensor"(%17375, %21800) : (!torch.vtensor<[4096],bf16>, !torch.vtensor<[4,?,4096],bf16>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%21801, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %21802 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %21803 = "torch.prims.convert_element_type"(%21801, %21802) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%21803, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %21804 = "torch.aten.div.Tensor"(%21803, %17377) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%21804, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %21805 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %21806 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %21807 = "torch.aten.clamp"(%21804, %21805, %21806) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%21807, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %21808 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %21809 = "torch.prims.convert_element_type"(%21807, %21808) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%21809, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %21810 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %21811 = "torch.aten.unsqueeze"(%17379, %21810) : (!torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>
    %21812 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %21813 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %21814 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %21815 = "torch.prim.ListConstruct"(%21812, %21813, %21814) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %21816 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %21817 = "torch.aten.expand"(%21811, %21815, %21816) : (!torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ>
    %21818 = "torch_c.to_builtin_tensor"(%21809) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %21819 = "torch_c.to_builtin_tensor"(%21817) : (!torch.vtensor<[4,14336,4096],f8E4M3FNUZ>) -> tensor<4x14336x4096xf8E4M3FNUZ>
    %21820 = "util.call"(%21818, %21819) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x?x14336xf32>
    %21821 = "torch_c.from_builtin_tensor"(%21820) : (tensor<4x?x14336xf32>) -> !torch.vtensor<[4,?,14336],f32>
    "torch.bind_symbolic_shape"(%21821, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> ()
    %21822 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %21823 = "torch.prims.convert_element_type"(%21821, %21822) : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%21823, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %21824 = "torch.aten.silu"(%21823) : (!torch.vtensor<[4,?,14336],bf16>) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%21824, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %21825 = "torch.aten.div.Tensor"(%21803, %17381) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%21825, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %21826 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %21827 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %21828 = "torch.aten.clamp"(%21825, %21826, %21827) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%21828, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %21829 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %21830 = "torch.prims.convert_element_type"(%21828, %21829) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%21830, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %21831 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %21832 = "torch.aten.unsqueeze"(%17383, %21831) : (!torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>
    %21833 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %21834 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %21835 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %21836 = "torch.prim.ListConstruct"(%21833, %21834, %21835) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %21837 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %21838 = "torch.aten.expand"(%21832, %21836, %21837) : (!torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ>
    %21839 = "torch_c.to_builtin_tensor"(%21830) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %21840 = "torch_c.to_builtin_tensor"(%21838) : (!torch.vtensor<[4,14336,4096],f8E4M3FNUZ>) -> tensor<4x14336x4096xf8E4M3FNUZ>
    %21841 = "util.call"(%21839, %21840) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x?x14336xf32>
    %21842 = "torch_c.from_builtin_tensor"(%21841) : (tensor<4x?x14336xf32>) -> !torch.vtensor<[4,?,14336],f32>
    "torch.bind_symbolic_shape"(%21842, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> ()
    %21843 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %21844 = "torch.prims.convert_element_type"(%21842, %21843) : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%21844, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %21845 = "torch.aten.mul.Tensor"(%21824, %21844) : (!torch.vtensor<[4,?,14336],bf16>, !torch.vtensor<[4,?,14336],bf16>) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%21845, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %21846 = "torch.aten.div.Tensor"(%21845, %17385) : (!torch.vtensor<[4,?,14336],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%21846, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %21847 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %21848 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %21849 = "torch.aten.clamp"(%21846, %21847, %21848) : (!torch.vtensor<[4,?,14336],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%21849, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %21850 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %21851 = "torch.prims.convert_element_type"(%21849, %21850) : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> !torch.vtensor<[4,?,14336],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%21851, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],f8E4M3FNUZ>, !torch.int) -> ()
    %21852 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %21853 = "torch.aten.unsqueeze"(%17387, %21852) : (!torch.vtensor<[4096,14336],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,14336],f8E4M3FNUZ>
    %21854 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %21855 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %21856 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %21857 = "torch.prim.ListConstruct"(%21854, %21855, %21856) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %21858 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %21859 = "torch.aten.expand"(%21853, %21857, %21858) : (!torch.vtensor<[1,4096,14336],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,14336],f8E4M3FNUZ>
    %21860 = "torch_c.to_builtin_tensor"(%21851) : (!torch.vtensor<[4,?,14336],f8E4M3FNUZ>) -> tensor<4x?x14336xf8E4M3FNUZ>
    %21861 = "torch_c.to_builtin_tensor"(%21859) : (!torch.vtensor<[4,4096,14336],f8E4M3FNUZ>) -> tensor<4x4096x14336xf8E4M3FNUZ>
    %21862 = "util.call"(%21860, %21861) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx14336xf8E4M3FNUZ_R4x4096x14336xf8E4M3FNUZ}> : (tensor<4x?x14336xf8E4M3FNUZ>, tensor<4x4096x14336xf8E4M3FNUZ>) -> tensor<4x?x4096xf32>
    %21863 = "torch_c.from_builtin_tensor"(%21862) : (tensor<4x?x4096xf32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%21863, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %21864 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %21865 = "torch.prims.convert_element_type"(%21863, %21864) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%21865, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %21866 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %21867 = "torch.aten.add.Tensor"(%21784, %21865, %21866) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%21867, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %21868 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %21869 = "torch.prims.convert_element_type"(%21867, %21868) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%21869, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %21870 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %21871 = "torch.aten.pow.Tensor_Scalar"(%21869, %21870) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%21871, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %21872 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %21873 = "torch.prim.ListConstruct"(%21872) : (!torch.int) -> !torch.list<int>
    %21874 = "torch.constant.bool"() <{value = true}> : () -> !torch.bool
    %21875 = "torch.constant.none"() : () -> !torch.none
    %21876 = "torch.aten.mean.dim"(%21871, %21873, %21874, %21875) : (!torch.vtensor<[4,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%21876, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %21877 = "torch.constant.float"() <{value = 1.000000e-05 : f64}> : () -> !torch.float
    %21878 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %21879 = "torch.aten.add.Scalar"(%21876, %21877, %21878) : (!torch.vtensor<[4,?,1],f32>, !torch.float, !torch.int) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%21879, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %21880 = "torch.aten.rsqrt"(%21879) : (!torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%21880, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %21881 = "torch.aten.mul.Tensor"(%21869, %21880) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%21881, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %21882 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %21883 = "torch.prims.convert_element_type"(%21881, %21882) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%21883, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %21884 = "torch.aten.mul.Tensor"(%17389, %21883) : (!torch.vtensor<[4096],bf16>, !torch.vtensor<[4,?,4096],bf16>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%21884, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %21885 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %21886 = "torch.prims.convert_element_type"(%21884, %21885) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%21886, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %21887 = "torch.aten.div.Tensor"(%21886, %17391) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%21887, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %21888 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %21889 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %21890 = "torch.aten.clamp"(%21887, %21888, %21889) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%21890, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %21891 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %21892 = "torch.prims.convert_element_type"(%21890, %21891) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%21892, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %21893 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %21894 = "torch.aten.unsqueeze"(%17393, %21893) : (!torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>
    %21895 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %21896 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %21897 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %21898 = "torch.prim.ListConstruct"(%21895, %21896, %21897) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %21899 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %21900 = "torch.aten.expand"(%21894, %21898, %21899) : (!torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ>
    %21901 = "torch_c.to_builtin_tensor"(%21892) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %21902 = "torch_c.to_builtin_tensor"(%21900) : (!torch.vtensor<[4,4096,4096],f8E4M3FNUZ>) -> tensor<4x4096x4096xf8E4M3FNUZ>
    %21903 = "util.call"(%21901, %21902) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x?x4096xf32>
    %21904 = "torch_c.from_builtin_tensor"(%21903) : (tensor<4x?x4096xf32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%21904, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %21905 = "torch.aten.div.Tensor"(%21904, %17395) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%21905, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %21906 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %21907 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %21908 = "torch.aten.clamp"(%21905, %21906, %21907) : (!torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%21908, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %21909 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %21910 = "torch.prims.convert_element_type"(%21908, %21909) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%21910, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %21911 = "torch.aten.div.Tensor"(%21886, %17397) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%21911, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %21912 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %21913 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %21914 = "torch.aten.clamp"(%21911, %21912, %21913) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%21914, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %21915 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %21916 = "torch.prims.convert_element_type"(%21914, %21915) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%21916, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %21917 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %21918 = "torch.aten.unsqueeze"(%17399, %21917) : (!torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>
    %21919 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %21920 = "torch.constant.int"() <{value = 1024 : i64}> : () -> !torch.int
    %21921 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %21922 = "torch.prim.ListConstruct"(%21919, %21920, %21921) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %21923 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %21924 = "torch.aten.expand"(%21918, %21922, %21923) : (!torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ>
    %21925 = "torch_c.to_builtin_tensor"(%21916) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %21926 = "torch_c.to_builtin_tensor"(%21924) : (!torch.vtensor<[4,1024,4096],f8E4M3FNUZ>) -> tensor<4x1024x4096xf8E4M3FNUZ>
    %21927 = "util.call"(%21925, %21926) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x?x1024xf32>
    %21928 = "torch_c.from_builtin_tensor"(%21927) : (tensor<4x?x1024xf32>) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%21928, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %21929 = "torch.aten.div.Tensor"(%21928, %17401) : (!torch.vtensor<[4,?,1024],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%21929, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %21930 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %21931 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %21932 = "torch.aten.clamp"(%21929, %21930, %21931) : (!torch.vtensor<[4,?,1024],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%21932, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %21933 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %21934 = "torch.prims.convert_element_type"(%21932, %21933) : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> !torch.vtensor<[4,?,1024],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%21934, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.int) -> ()
    %21935 = "torch.aten.div.Tensor"(%21886, %17403) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%21935, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %21936 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %21937 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %21938 = "torch.aten.clamp"(%21935, %21936, %21937) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%21938, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %21939 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %21940 = "torch.prims.convert_element_type"(%21938, %21939) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%21940, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %21941 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %21942 = "torch.aten.unsqueeze"(%17405, %21941) : (!torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>
    %21943 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %21944 = "torch.constant.int"() <{value = 1024 : i64}> : () -> !torch.int
    %21945 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %21946 = "torch.prim.ListConstruct"(%21943, %21944, %21945) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %21947 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %21948 = "torch.aten.expand"(%21942, %21946, %21947) : (!torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ>
    %21949 = "torch_c.to_builtin_tensor"(%21940) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %21950 = "torch_c.to_builtin_tensor"(%21948) : (!torch.vtensor<[4,1024,4096],f8E4M3FNUZ>) -> tensor<4x1024x4096xf8E4M3FNUZ>
    %21951 = "util.call"(%21949, %21950) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x?x1024xf32>
    %21952 = "torch_c.from_builtin_tensor"(%21951) : (tensor<4x?x1024xf32>) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%21952, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %21953 = "torch.aten.div.Tensor"(%21952, %17407) : (!torch.vtensor<[4,?,1024],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%21953, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %21954 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %21955 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %21956 = "torch.aten.clamp"(%21953, %21954, %21955) : (!torch.vtensor<[4,?,1024],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%21956, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %21957 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %21958 = "torch.prims.convert_element_type"(%21956, %21957) : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> !torch.vtensor<[4,?,1024],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%21958, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.int) -> ()
    %21959 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %21960 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %21961 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %21962 = "torch.prim.ListConstruct"(%21959, %18481, %21960, %21961) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %21963 = "torch.aten.view"(%21910, %21962) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%21963, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %21964 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %21965 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %21966 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %21967 = "torch.prim.ListConstruct"(%21964, %18481, %21965, %21966) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %21968 = "torch.aten.view"(%21934, %21967) : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%21968, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %21969 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %21970 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %21971 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %21972 = "torch.prim.ListConstruct"(%21969, %18481, %21970, %21971) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %21973 = "torch.aten.view"(%21958, %21972) : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%21973, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %21974 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
    %21975 = "torch.constant.none"() : () -> !torch.none
    %21976 = "torch.constant.none"() : () -> !torch.none
    %21977 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %21978 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %21979 = "torch.aten.arange"(%21974, %21975, %21976, %21977, %21978) : (!torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[131072],si64>
    %21980 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %21981 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %21982 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %21983 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %21984 = "torch.constant.none"() : () -> !torch.none
    %21985 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %21986 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %21987 = "torch.aten.arange.start_step"(%21980, %21981, %21982, %21983, %21984, %21985, %21986) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[64],si64>
    %21988 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %21989 = "torch.prims.convert_element_type"(%21987, %21988) : (!torch.vtensor<[64],si64>, !torch.int) -> !torch.vtensor<[64],f32>
    %21990 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %21991 = "torch.aten.div.Scalar"(%21989, %21990) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %21992 = "torch.constant.float"() <{value = 5.000000e+05 : f64}> : () -> !torch.float
    %21993 = "torch.aten.pow.Scalar"(%21992, %21991) : (!torch.float, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %21994 = "torch.aten.reciprocal"(%21993) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %21995 = "torch.constant.float"() <{value = 1.000000e+00 : f64}> : () -> !torch.float
    %21996 = "torch.aten.mul.Scalar"(%21994, %21995) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
    %21997 = "torch.aten.reciprocal"(%21996) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %21998 = "torch.constant.float"() <{value = 6.2831853071795862 : f64}> : () -> !torch.float
    %21999 = "torch.aten.mul.Scalar"(%21997, %21998) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
    %22000 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
    %22001 = "torch.aten.gt.Scalar"(%21999, %22000) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %22002 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %22003 = "torch.aten.div.Scalar"(%21996, %22002) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %22004 = "torch.aten.where.self"(%22001, %22003, %21996) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %22005 = "torch.aten.reciprocal"(%21999) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %22006 = "torch.constant.int"() <{value = 8192 : i64}> : () -> !torch.int
    %22007 = "torch.aten.mul.Scalar"(%22005, %22006) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %22008 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %22009 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %22010 = "torch.aten.sub.Scalar"(%22007, %22008, %22009) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
    %22011 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %22012 = "torch.aten.div.Scalar"(%22010, %22011) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %22013 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %22014 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %22015 = "torch.aten.rsub.Scalar"(%22012, %22013, %22014) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
    %22016 = "torch.aten.mul.Tensor"(%22015, %22004) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %22017 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %22018 = "torch.aten.div.Scalar"(%22016, %22017) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %22019 = "torch.aten.mul.Tensor"(%22012, %22004) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %22020 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %22021 = "torch.aten.add.Tensor"(%22018, %22019, %22020) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %22022 = "torch.constant.float"() <{value = 2.048000e+03 : f64}> : () -> !torch.float
    %22023 = "torch.aten.lt.Scalar"(%21999, %22022) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %22024 = "torch.aten.bitwise_not"(%22023) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %22025 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
    %22026 = "torch.aten.gt.Scalar"(%21999, %22025) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %22027 = "torch.aten.bitwise_not"(%22026) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %22028 = "torch.aten.mul.Tensor"(%22024, %22027) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %22029 = "torch.aten.where.self"(%22028, %22021, %22004) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %22030 = "torch.prim.ListConstruct"(%22029, %22029) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.list<vtensor>
    %22031 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %22032 = "torch.aten.cat"(%22030, %22031) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[128],f32>
    %22033 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %22034 = "torch.prims.convert_element_type"(%21979, %22033) : (!torch.vtensor<[131072],si64>, !torch.int) -> !torch.vtensor<[131072],f32>
    %22035 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %22036 = "torch.prims.convert_element_type"(%22032, %22035) : (!torch.vtensor<[128],f32>, !torch.int) -> !torch.vtensor<[128],f32>
    %22037 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
    %22038 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %22039 = "torch.prim.ListConstruct"(%22037, %22038) : (!torch.int, !torch.int) -> !torch.list<int>
    %22040 = "torch.aten.view"(%22034, %22039) : (!torch.vtensor<[131072],f32>, !torch.list<int>) -> !torch.vtensor<[131072,1],f32>
    %22041 = "torch.aten.mul.Tensor"(%22040, %22036) : (!torch.vtensor<[131072,1],f32>, !torch.vtensor<[128],f32>) -> !torch.vtensor<[131072,128],f32>
    %22042 = "torch.aten.cos"(%22041) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
    %22043 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %22044 = "torch.prims.convert_element_type"(%22042, %22043) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
    %22045 = "torch.aten.sin"(%22041) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
    %22046 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %22047 = "torch.prims.convert_element_type"(%22045, %22046) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
    %22048 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %22049 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %22050 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %22051 = "torch.aten.slice.Tensor"(%22044, %22048, %22049, %18481, %22050) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%22051, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %22052 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %22053 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %22054 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %22055 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %22056 = "torch.aten.slice.Tensor"(%22051, %22052, %22053, %22054, %22055) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%22056, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %22057 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %22058 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %22059 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %22060 = "torch.aten.slice.Tensor"(%22047, %22057, %22058, %18481, %22059) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%22060, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %22061 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %22062 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %22063 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %22064 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %22065 = "torch.aten.slice.Tensor"(%22060, %22061, %22062, %22063, %22064) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%22065, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %22066 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %22067 = "torch.aten.unsqueeze"(%22056, %22066) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%22067, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %22068 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %22069 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %22070 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %22071 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %22072 = "torch.aten.slice.Tensor"(%22067, %22068, %22069, %22070, %22071) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%22072, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %22073 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %22074 = "torch.aten.unsqueeze"(%22072, %22073) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%22074, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %22075 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %22076 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %22077 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %22078 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %22079 = "torch.aten.slice.Tensor"(%22074, %22075, %22076, %22077, %22078) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%22079, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %22080 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %22081 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %22082 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %22083 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %22084 = "torch.prim.ListConstruct"(%22080, %22081, %22082, %22083) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %22085 = "torch.aten.repeat"(%22079, %22084) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%22085, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
    %22086 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %22087 = "torch.aten.unsqueeze"(%22065, %22086) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%22087, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %22088 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %22089 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %22090 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %22091 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %22092 = "torch.aten.slice.Tensor"(%22087, %22088, %22089, %22090, %22091) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%22092, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %22093 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %22094 = "torch.aten.unsqueeze"(%22092, %22093) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%22094, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %22095 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %22096 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %22097 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %22098 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %22099 = "torch.aten.slice.Tensor"(%22094, %22095, %22096, %22097, %22098) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%22099, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %22100 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %22101 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %22102 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %22103 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %22104 = "torch.prim.ListConstruct"(%22100, %22101, %22102, %22103) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %22105 = "torch.aten.repeat"(%22099, %22104) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%22105, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
    %22106 = "torch.aten.mul.Tensor"(%21963, %22085) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%22106, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %22107 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %22108 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %22109 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %22110 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %22111 = "torch.aten.slice.Tensor"(%21963, %22107, %22108, %22109, %22110) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%22111, %18474) <{shape_expressions = #map18}> : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.int) -> ()
    %22112 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %22113 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %22114 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %22115 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %22116 = "torch.aten.slice.Tensor"(%21963, %22112, %22113, %22114, %22115) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%22116, %18474) <{shape_expressions = #map18}> : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.int) -> ()
    %22117 = "torch.aten.neg"(%22116) : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>) -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%22117, %18474) <{shape_expressions = #map18}> : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.int) -> ()
    %22118 = "torch.prim.ListConstruct"(%22117, %22111) : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>) -> !torch.list<vtensor>
    %22119 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %22120 = "torch.aten.cat"(%22118, %22119) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%22120, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %22121 = "torch.aten.mul.Tensor"(%22120, %22105) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%22121, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %22122 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %22123 = "torch.aten.add.Tensor"(%22106, %22121, %22122) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%22123, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %22124 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
    %22125 = "torch.constant.none"() : () -> !torch.none
    %22126 = "torch.constant.none"() : () -> !torch.none
    %22127 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %22128 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %22129 = "torch.aten.arange"(%22124, %22125, %22126, %22127, %22128) : (!torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[131072],si64>
    %22130 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %22131 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %22132 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %22133 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %22134 = "torch.constant.none"() : () -> !torch.none
    %22135 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %22136 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %22137 = "torch.aten.arange.start_step"(%22130, %22131, %22132, %22133, %22134, %22135, %22136) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[64],si64>
    %22138 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %22139 = "torch.prims.convert_element_type"(%22137, %22138) : (!torch.vtensor<[64],si64>, !torch.int) -> !torch.vtensor<[64],f32>
    %22140 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %22141 = "torch.aten.div.Scalar"(%22139, %22140) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %22142 = "torch.constant.float"() <{value = 5.000000e+05 : f64}> : () -> !torch.float
    %22143 = "torch.aten.pow.Scalar"(%22142, %22141) : (!torch.float, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %22144 = "torch.aten.reciprocal"(%22143) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %22145 = "torch.constant.float"() <{value = 1.000000e+00 : f64}> : () -> !torch.float
    %22146 = "torch.aten.mul.Scalar"(%22144, %22145) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
    %22147 = "torch.aten.reciprocal"(%22146) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %22148 = "torch.constant.float"() <{value = 6.2831853071795862 : f64}> : () -> !torch.float
    %22149 = "torch.aten.mul.Scalar"(%22147, %22148) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
    %22150 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
    %22151 = "torch.aten.gt.Scalar"(%22149, %22150) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %22152 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %22153 = "torch.aten.div.Scalar"(%22146, %22152) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %22154 = "torch.aten.where.self"(%22151, %22153, %22146) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %22155 = "torch.aten.reciprocal"(%22149) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %22156 = "torch.constant.int"() <{value = 8192 : i64}> : () -> !torch.int
    %22157 = "torch.aten.mul.Scalar"(%22155, %22156) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %22158 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %22159 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %22160 = "torch.aten.sub.Scalar"(%22157, %22158, %22159) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
    %22161 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %22162 = "torch.aten.div.Scalar"(%22160, %22161) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %22163 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %22164 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %22165 = "torch.aten.rsub.Scalar"(%22162, %22163, %22164) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
    %22166 = "torch.aten.mul.Tensor"(%22165, %22154) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %22167 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %22168 = "torch.aten.div.Scalar"(%22166, %22167) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %22169 = "torch.aten.mul.Tensor"(%22162, %22154) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %22170 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %22171 = "torch.aten.add.Tensor"(%22168, %22169, %22170) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %22172 = "torch.constant.float"() <{value = 2.048000e+03 : f64}> : () -> !torch.float
    %22173 = "torch.aten.lt.Scalar"(%22149, %22172) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %22174 = "torch.aten.bitwise_not"(%22173) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %22175 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
    %22176 = "torch.aten.gt.Scalar"(%22149, %22175) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %22177 = "torch.aten.bitwise_not"(%22176) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %22178 = "torch.aten.mul.Tensor"(%22174, %22177) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %22179 = "torch.aten.where.self"(%22178, %22171, %22154) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %22180 = "torch.prim.ListConstruct"(%22179, %22179) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.list<vtensor>
    %22181 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %22182 = "torch.aten.cat"(%22180, %22181) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[128],f32>
    %22183 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %22184 = "torch.prims.convert_element_type"(%22129, %22183) : (!torch.vtensor<[131072],si64>, !torch.int) -> !torch.vtensor<[131072],f32>
    %22185 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %22186 = "torch.prims.convert_element_type"(%22182, %22185) : (!torch.vtensor<[128],f32>, !torch.int) -> !torch.vtensor<[128],f32>
    %22187 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
    %22188 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %22189 = "torch.prim.ListConstruct"(%22187, %22188) : (!torch.int, !torch.int) -> !torch.list<int>
    %22190 = "torch.aten.view"(%22184, %22189) : (!torch.vtensor<[131072],f32>, !torch.list<int>) -> !torch.vtensor<[131072,1],f32>
    %22191 = "torch.aten.mul.Tensor"(%22190, %22186) : (!torch.vtensor<[131072,1],f32>, !torch.vtensor<[128],f32>) -> !torch.vtensor<[131072,128],f32>
    %22192 = "torch.aten.cos"(%22191) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
    %22193 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %22194 = "torch.prims.convert_element_type"(%22192, %22193) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
    %22195 = "torch.aten.sin"(%22191) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
    %22196 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %22197 = "torch.prims.convert_element_type"(%22195, %22196) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
    %22198 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %22199 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %22200 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %22201 = "torch.aten.slice.Tensor"(%22194, %22198, %22199, %18481, %22200) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%22201, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %22202 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %22203 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %22204 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %22205 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %22206 = "torch.aten.slice.Tensor"(%22201, %22202, %22203, %22204, %22205) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%22206, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %22207 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %22208 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %22209 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %22210 = "torch.aten.slice.Tensor"(%22197, %22207, %22208, %18481, %22209) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%22210, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %22211 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %22212 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %22213 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %22214 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %22215 = "torch.aten.slice.Tensor"(%22210, %22211, %22212, %22213, %22214) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%22215, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %22216 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %22217 = "torch.aten.unsqueeze"(%22206, %22216) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%22217, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %22218 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %22219 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %22220 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %22221 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %22222 = "torch.aten.slice.Tensor"(%22217, %22218, %22219, %22220, %22221) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%22222, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %22223 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %22224 = "torch.aten.unsqueeze"(%22222, %22223) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%22224, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %22225 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %22226 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %22227 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %22228 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %22229 = "torch.aten.slice.Tensor"(%22224, %22225, %22226, %22227, %22228) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%22229, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %22230 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %22231 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %22232 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %22233 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %22234 = "torch.prim.ListConstruct"(%22230, %22231, %22232, %22233) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %22235 = "torch.aten.repeat"(%22229, %22234) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%22235, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
    %22236 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %22237 = "torch.aten.unsqueeze"(%22215, %22236) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%22237, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %22238 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %22239 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %22240 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %22241 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %22242 = "torch.aten.slice.Tensor"(%22237, %22238, %22239, %22240, %22241) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%22242, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %22243 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %22244 = "torch.aten.unsqueeze"(%22242, %22243) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%22244, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %22245 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %22246 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %22247 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %22248 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %22249 = "torch.aten.slice.Tensor"(%22244, %22245, %22246, %22247, %22248) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%22249, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %22250 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %22251 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %22252 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %22253 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %22254 = "torch.prim.ListConstruct"(%22250, %22251, %22252, %22253) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %22255 = "torch.aten.repeat"(%22249, %22254) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%22255, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
    %22256 = "torch.aten.mul.Tensor"(%21968, %22235) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%22256, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %22257 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %22258 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %22259 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %22260 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %22261 = "torch.aten.slice.Tensor"(%21968, %22257, %22258, %22259, %22260) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%22261, %18474) <{shape_expressions = #map19}> : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.int) -> ()
    %22262 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %22263 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %22264 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %22265 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %22266 = "torch.aten.slice.Tensor"(%21968, %22262, %22263, %22264, %22265) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%22266, %18474) <{shape_expressions = #map19}> : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.int) -> ()
    %22267 = "torch.aten.neg"(%22266) : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>) -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%22267, %18474) <{shape_expressions = #map19}> : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.int) -> ()
    %22268 = "torch.prim.ListConstruct"(%22267, %22261) : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>) -> !torch.list<vtensor>
    %22269 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %22270 = "torch.aten.cat"(%22268, %22269) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%22270, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %22271 = "torch.aten.mul.Tensor"(%22270, %22255) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%22271, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %22272 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %22273 = "torch.aten.add.Tensor"(%22256, %22271, %22272) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%22273, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %22274 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %22275 = "torch.aten.mul.Scalar"(%arg69, %22274) : (!torch.vtensor<[4,?],si64>, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%22275, %18474) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %22276 = "torch.constant.int"() <{value = 10 : i64}> : () -> !torch.int
    %22277 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %22278 = "torch.aten.add.Scalar"(%22275, %22276, %22277) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%22278, %18474) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %22279 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %22280 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %22281 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %22282 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %22283 = "torch.prim.ListConstruct"(%22279, %18477, %22280, %22281, %22282) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %22284 = "torch.aten.view"(%22273, %22283) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%22284, %18474) <{shape_expressions = #map22}> : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %22285 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %22286 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %22287 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %22288 = "torch.prim.ListConstruct"(%19011, %22285, %22286, %22287) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %22289 = "torch.aten.view"(%22284, %22288) : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%22289, %18474) <{shape_expressions = #map23}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %22290 = "torch.prim.ListConstruct"(%19011) : (!torch.int) -> !torch.list<int>
    %22291 = "torch.aten.view"(%22278, %22290) : (!torch.vtensor<[4,?],si64>, !torch.list<int>) -> !torch.vtensor<[?],si64>
    "torch.bind_symbolic_shape"(%22291, %18474) <{shape_expressions = #map24}> : (!torch.vtensor<[?],si64>, !torch.int) -> ()
    %22292 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %22293 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %22294 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %22295 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %22296 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %22297 = "torch.prim.ListConstruct"(%18479, %22292, %22293, %22294, %22295, %22296) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %22298 = "torch.aten.view"(%21700, %22297) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%22298, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %22299 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %22300 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %22301 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %22302 = "torch.prim.ListConstruct"(%18993, %22299, %22300, %22301) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %22303 = "torch.aten.view"(%22298, %22302) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%22303, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %22304 = "torch.prim.ListConstruct"(%22291) : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>>
    %22305 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %22306 = "torch.aten.index_put"(%22303, %22304, %22289, %22305) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.bool) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%22306, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %22307 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %22308 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %22309 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %22310 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %22311 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %22312 = "torch.prim.ListConstruct"(%18479, %22307, %22308, %22309, %22310, %22311) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %22313 = "torch.aten.view"(%22306, %22312) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%22313, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %22314 = "torch.constant.int"() <{value = 2097152 : i64}> : () -> !torch.int
    %22315 = "torch.prim.ListConstruct"(%18479, %22314) : (!torch.int, !torch.int) -> !torch.list<int>
    %22316 = "torch.aten.view"(%22313, %22315) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,2097152],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%22316, %18475) <{shape_expressions = #map2}> : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> ()
    %22317 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %22318 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %22319 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %22320 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %22321 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %22322 = "torch.prim.ListConstruct"(%18479, %22317, %22318, %22319, %22320, %22321) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %22323 = "torch.aten.view"(%22316, %22322) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%22323, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %22324 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %22325 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %22326 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %22327 = "torch.prim.ListConstruct"(%18993, %22324, %22325, %22326) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %22328 = "torch.aten.view"(%22323, %22327) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%22328, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %22329 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %22330 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %22331 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %22332 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %22333 = "torch.prim.ListConstruct"(%22329, %18477, %22330, %22331, %22332) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %22334 = "torch.aten.view"(%21973, %22333) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%22334, %18474) <{shape_expressions = #map22}> : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %22335 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %22336 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %22337 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %22338 = "torch.prim.ListConstruct"(%19011, %22335, %22336, %22337) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %22339 = "torch.aten.view"(%22334, %22338) : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%22339, %18474) <{shape_expressions = #map23}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %22340 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %22341 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %22342 = "torch.aten.add.Scalar"(%22278, %22340, %22341) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%22342, %18474) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %22343 = "torch.prim.ListConstruct"(%19011) : (!torch.int) -> !torch.list<int>
    %22344 = "torch.aten.view"(%22342, %22343) : (!torch.vtensor<[4,?],si64>, !torch.list<int>) -> !torch.vtensor<[?],si64>
    "torch.bind_symbolic_shape"(%22344, %18474) <{shape_expressions = #map24}> : (!torch.vtensor<[?],si64>, !torch.int) -> ()
    %22345 = "torch.prim.ListConstruct"(%22344) : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>>
    %22346 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %22347 = "torch.aten.index_put"(%22328, %22345, %22339, %22346) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.bool) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%22347, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %22348 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %22349 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %22350 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %22351 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %22352 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %22353 = "torch.prim.ListConstruct"(%18479, %22348, %22349, %22350, %22351, %22352) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %22354 = "torch.aten.view"(%22347, %22353) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%22354, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %22355 = "torch.constant.int"() <{value = 2097152 : i64}> : () -> !torch.int
    %22356 = "torch.prim.ListConstruct"(%18479, %22355) : (!torch.int, !torch.int) -> !torch.list<int>
    %22357 = "torch.aten.view"(%22354, %22356) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,2097152],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%22357, %18475) <{shape_expressions = #map2}> : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> ()
    %22358 = "torch.constant.int"() <{value = -2 : i64}> : () -> !torch.int
    %22359 = "torch.aten.unsqueeze"(%22273, %22358) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%22359, %18474) <{shape_expressions = #map25}> : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.int) -> ()
    %22360 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %22361 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %22362 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %22363 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %22364 = "torch.prim.ListConstruct"(%22360, %18481, %22361, %22362, %22363) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %22365 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %22366 = "torch.aten.expand"(%22359, %22364, %22365) : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%22366, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %22367 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %22368 = "torch.aten.clone"(%22366, %22367) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%22368, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %22369 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %22370 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %22371 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %22372 = "torch.prim.ListConstruct"(%22369, %18481, %22370, %22371) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %22373 = "torch.aten._unsafe_view"(%22368, %22372) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%22373, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %22374 = "torch.constant.int"() <{value = -2 : i64}> : () -> !torch.int
    %22375 = "torch.aten.unsqueeze"(%21973, %22374) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%22375, %18474) <{shape_expressions = #map25}> : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.int) -> ()
    %22376 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %22377 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %22378 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %22379 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %22380 = "torch.prim.ListConstruct"(%22376, %18481, %22377, %22378, %22379) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %22381 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %22382 = "torch.aten.expand"(%22375, %22380, %22381) : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%22382, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %22383 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %22384 = "torch.aten.clone"(%22382, %22383) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%22384, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %22385 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %22386 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %22387 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %22388 = "torch.prim.ListConstruct"(%22385, %18481, %22386, %22387) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %22389 = "torch.aten._unsafe_view"(%22384, %22388) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%22389, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %22390 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %22391 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %22392 = "torch.aten.transpose.int"(%22123, %22390, %22391) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%22392, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %22393 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %22394 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %22395 = "torch.aten.transpose.int"(%22373, %22393, %22394) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%22395, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %22396 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %22397 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %22398 = "torch.aten.transpose.int"(%22389, %22396, %22397) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%22398, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %22399 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %22400 = "torch.aten.squeeze.dim"(%18570, %22399) : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,1,?,?],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%22400, %18474) <{shape_expressions = #map8}> : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> ()
    %22401 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %22402 = "torch.aten.squeeze.dim"(%22400, %22401) : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,1,?,?],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%22402, %18474) <{shape_expressions = #map8}> : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> ()
    %22403 = "torch_c.to_builtin_tensor"(%22392) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %22404 = "torch_c.to_builtin_tensor"(%22395) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %22405 = "torch_c.to_builtin_tensor"(%22398) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %22406 = "torch_c.to_builtin_tensor"(%22402) : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>) -> tensor<4x1x?x?xf8E4M3FNUZ>
    %22407 = "tensor.cast"(%22406) : (tensor<4x1x?x?xf8E4M3FNUZ>) -> tensor<?x?x?x?xf8E4M3FNUZ>
    %22408 = "torch_c.to_builtin_tensor"(%17409) : (!torch.vtensor<[],f32>) -> tensor<f32>
    %22409 = "util.call"(%22403, %22404, %22405, %22408, %22407) <{callee = @sharktank_masked_flash_attention_4_32_128_128_f8E4M3FNUZ_f32_f32}> : (tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<f32>, tensor<?x?x?x?xf8E4M3FNUZ>) -> tensor<4x32x?x128xf32>
    %22410 = "torch_c.from_builtin_tensor"(%22409) : (tensor<4x32x?x128xf32>) -> !torch.vtensor<[4,32,?,128],f32>
    "torch.bind_symbolic_shape"(%22410, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f32>, !torch.int) -> ()
    %22411 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %22412 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %22413 = "torch.aten.transpose.int"(%22410, %22411, %22412) : (!torch.vtensor<[4,32,?,128],f32>, !torch.int, !torch.int) -> !torch.vtensor<[4,?,32,128],f32>
    "torch.bind_symbolic_shape"(%22413, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f32>, !torch.int) -> ()
    %22414 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %22415 = "torch.aten.clone"(%22413, %22414) : (!torch.vtensor<[4,?,32,128],f32>, !torch.int) -> !torch.vtensor<[4,?,32,128],f32>
    "torch.bind_symbolic_shape"(%22415, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f32>, !torch.int) -> ()
    %22416 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %22417 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %22418 = "torch.prim.ListConstruct"(%22416, %18481, %22417) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %22419 = "torch.aten._unsafe_view"(%22415, %22418) : (!torch.vtensor<[4,?,32,128],f32>, !torch.list<int>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%22419, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %22420 = "torch.aten.div.Tensor"(%22419, %17411) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%22420, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %22421 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %22422 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %22423 = "torch.aten.clamp"(%22420, %22421, %22422) : (!torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%22423, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %22424 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %22425 = "torch.prims.convert_element_type"(%22423, %22424) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%22425, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %22426 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %22427 = "torch.aten.unsqueeze"(%17413, %22426) : (!torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>
    %22428 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %22429 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %22430 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %22431 = "torch.prim.ListConstruct"(%22428, %22429, %22430) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %22432 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %22433 = "torch.aten.expand"(%22427, %22431, %22432) : (!torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ>
    %22434 = "torch_c.to_builtin_tensor"(%22425) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %22435 = "torch_c.to_builtin_tensor"(%22433) : (!torch.vtensor<[4,4096,4096],f8E4M3FNUZ>) -> tensor<4x4096x4096xf8E4M3FNUZ>
    %22436 = "util.call"(%22434, %22435) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x?x4096xf32>
    %22437 = "torch_c.from_builtin_tensor"(%22436) : (tensor<4x?x4096xf32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%22437, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %22438 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %22439 = "torch.prims.convert_element_type"(%22437, %22438) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%22439, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %22440 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %22441 = "torch.aten.add.Tensor"(%21867, %22439, %22440) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%22441, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %22442 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %22443 = "torch.prims.convert_element_type"(%22441, %22442) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%22443, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %22444 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %22445 = "torch.aten.pow.Tensor_Scalar"(%22443, %22444) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%22445, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %22446 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %22447 = "torch.prim.ListConstruct"(%22446) : (!torch.int) -> !torch.list<int>
    %22448 = "torch.constant.bool"() <{value = true}> : () -> !torch.bool
    %22449 = "torch.constant.none"() : () -> !torch.none
    %22450 = "torch.aten.mean.dim"(%22445, %22447, %22448, %22449) : (!torch.vtensor<[4,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%22450, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %22451 = "torch.constant.float"() <{value = 1.000000e-05 : f64}> : () -> !torch.float
    %22452 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %22453 = "torch.aten.add.Scalar"(%22450, %22451, %22452) : (!torch.vtensor<[4,?,1],f32>, !torch.float, !torch.int) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%22453, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %22454 = "torch.aten.rsqrt"(%22453) : (!torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%22454, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %22455 = "torch.aten.mul.Tensor"(%22443, %22454) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%22455, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %22456 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %22457 = "torch.prims.convert_element_type"(%22455, %22456) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%22457, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %22458 = "torch.aten.mul.Tensor"(%17415, %22457) : (!torch.vtensor<[4096],bf16>, !torch.vtensor<[4,?,4096],bf16>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%22458, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %22459 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %22460 = "torch.prims.convert_element_type"(%22458, %22459) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%22460, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %22461 = "torch.aten.div.Tensor"(%22460, %17417) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%22461, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %22462 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %22463 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %22464 = "torch.aten.clamp"(%22461, %22462, %22463) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%22464, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %22465 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %22466 = "torch.prims.convert_element_type"(%22464, %22465) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%22466, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %22467 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %22468 = "torch.aten.unsqueeze"(%17419, %22467) : (!torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>
    %22469 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %22470 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %22471 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %22472 = "torch.prim.ListConstruct"(%22469, %22470, %22471) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %22473 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %22474 = "torch.aten.expand"(%22468, %22472, %22473) : (!torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ>
    %22475 = "torch_c.to_builtin_tensor"(%22466) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %22476 = "torch_c.to_builtin_tensor"(%22474) : (!torch.vtensor<[4,14336,4096],f8E4M3FNUZ>) -> tensor<4x14336x4096xf8E4M3FNUZ>
    %22477 = "util.call"(%22475, %22476) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x?x14336xf32>
    %22478 = "torch_c.from_builtin_tensor"(%22477) : (tensor<4x?x14336xf32>) -> !torch.vtensor<[4,?,14336],f32>
    "torch.bind_symbolic_shape"(%22478, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> ()
    %22479 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %22480 = "torch.prims.convert_element_type"(%22478, %22479) : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%22480, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %22481 = "torch.aten.silu"(%22480) : (!torch.vtensor<[4,?,14336],bf16>) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%22481, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %22482 = "torch.aten.div.Tensor"(%22460, %17421) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%22482, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %22483 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %22484 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %22485 = "torch.aten.clamp"(%22482, %22483, %22484) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%22485, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %22486 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %22487 = "torch.prims.convert_element_type"(%22485, %22486) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%22487, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %22488 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %22489 = "torch.aten.unsqueeze"(%17423, %22488) : (!torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>
    %22490 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %22491 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %22492 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %22493 = "torch.prim.ListConstruct"(%22490, %22491, %22492) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %22494 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %22495 = "torch.aten.expand"(%22489, %22493, %22494) : (!torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ>
    %22496 = "torch_c.to_builtin_tensor"(%22487) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %22497 = "torch_c.to_builtin_tensor"(%22495) : (!torch.vtensor<[4,14336,4096],f8E4M3FNUZ>) -> tensor<4x14336x4096xf8E4M3FNUZ>
    %22498 = "util.call"(%22496, %22497) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x?x14336xf32>
    %22499 = "torch_c.from_builtin_tensor"(%22498) : (tensor<4x?x14336xf32>) -> !torch.vtensor<[4,?,14336],f32>
    "torch.bind_symbolic_shape"(%22499, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> ()
    %22500 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %22501 = "torch.prims.convert_element_type"(%22499, %22500) : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%22501, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %22502 = "torch.aten.mul.Tensor"(%22481, %22501) : (!torch.vtensor<[4,?,14336],bf16>, !torch.vtensor<[4,?,14336],bf16>) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%22502, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %22503 = "torch.aten.div.Tensor"(%22502, %17425) : (!torch.vtensor<[4,?,14336],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%22503, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %22504 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %22505 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %22506 = "torch.aten.clamp"(%22503, %22504, %22505) : (!torch.vtensor<[4,?,14336],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%22506, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %22507 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %22508 = "torch.prims.convert_element_type"(%22506, %22507) : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> !torch.vtensor<[4,?,14336],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%22508, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],f8E4M3FNUZ>, !torch.int) -> ()
    %22509 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %22510 = "torch.aten.unsqueeze"(%17427, %22509) : (!torch.vtensor<[4096,14336],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,14336],f8E4M3FNUZ>
    %22511 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %22512 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %22513 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %22514 = "torch.prim.ListConstruct"(%22511, %22512, %22513) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %22515 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %22516 = "torch.aten.expand"(%22510, %22514, %22515) : (!torch.vtensor<[1,4096,14336],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,14336],f8E4M3FNUZ>
    %22517 = "torch_c.to_builtin_tensor"(%22508) : (!torch.vtensor<[4,?,14336],f8E4M3FNUZ>) -> tensor<4x?x14336xf8E4M3FNUZ>
    %22518 = "torch_c.to_builtin_tensor"(%22516) : (!torch.vtensor<[4,4096,14336],f8E4M3FNUZ>) -> tensor<4x4096x14336xf8E4M3FNUZ>
    %22519 = "util.call"(%22517, %22518) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx14336xf8E4M3FNUZ_R4x4096x14336xf8E4M3FNUZ}> : (tensor<4x?x14336xf8E4M3FNUZ>, tensor<4x4096x14336xf8E4M3FNUZ>) -> tensor<4x?x4096xf32>
    %22520 = "torch_c.from_builtin_tensor"(%22519) : (tensor<4x?x4096xf32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%22520, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %22521 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %22522 = "torch.prims.convert_element_type"(%22520, %22521) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%22522, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %22523 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %22524 = "torch.aten.add.Tensor"(%22441, %22522, %22523) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%22524, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %22525 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %22526 = "torch.prims.convert_element_type"(%22524, %22525) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%22526, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %22527 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %22528 = "torch.aten.pow.Tensor_Scalar"(%22526, %22527) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%22528, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %22529 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %22530 = "torch.prim.ListConstruct"(%22529) : (!torch.int) -> !torch.list<int>
    %22531 = "torch.constant.bool"() <{value = true}> : () -> !torch.bool
    %22532 = "torch.constant.none"() : () -> !torch.none
    %22533 = "torch.aten.mean.dim"(%22528, %22530, %22531, %22532) : (!torch.vtensor<[4,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%22533, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %22534 = "torch.constant.float"() <{value = 1.000000e-05 : f64}> : () -> !torch.float
    %22535 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %22536 = "torch.aten.add.Scalar"(%22533, %22534, %22535) : (!torch.vtensor<[4,?,1],f32>, !torch.float, !torch.int) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%22536, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %22537 = "torch.aten.rsqrt"(%22536) : (!torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%22537, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %22538 = "torch.aten.mul.Tensor"(%22526, %22537) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%22538, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %22539 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %22540 = "torch.prims.convert_element_type"(%22538, %22539) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%22540, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %22541 = "torch.aten.mul.Tensor"(%17429, %22540) : (!torch.vtensor<[4096],bf16>, !torch.vtensor<[4,?,4096],bf16>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%22541, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %22542 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %22543 = "torch.prims.convert_element_type"(%22541, %22542) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%22543, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %22544 = "torch.aten.div.Tensor"(%22543, %17431) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%22544, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %22545 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %22546 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %22547 = "torch.aten.clamp"(%22544, %22545, %22546) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%22547, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %22548 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %22549 = "torch.prims.convert_element_type"(%22547, %22548) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%22549, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %22550 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %22551 = "torch.aten.unsqueeze"(%17433, %22550) : (!torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>
    %22552 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %22553 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %22554 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %22555 = "torch.prim.ListConstruct"(%22552, %22553, %22554) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %22556 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %22557 = "torch.aten.expand"(%22551, %22555, %22556) : (!torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ>
    %22558 = "torch_c.to_builtin_tensor"(%22549) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %22559 = "torch_c.to_builtin_tensor"(%22557) : (!torch.vtensor<[4,4096,4096],f8E4M3FNUZ>) -> tensor<4x4096x4096xf8E4M3FNUZ>
    %22560 = "util.call"(%22558, %22559) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x?x4096xf32>
    %22561 = "torch_c.from_builtin_tensor"(%22560) : (tensor<4x?x4096xf32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%22561, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %22562 = "torch.aten.div.Tensor"(%22561, %17435) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%22562, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %22563 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %22564 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %22565 = "torch.aten.clamp"(%22562, %22563, %22564) : (!torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%22565, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %22566 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %22567 = "torch.prims.convert_element_type"(%22565, %22566) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%22567, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %22568 = "torch.aten.div.Tensor"(%22543, %17437) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%22568, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %22569 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %22570 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %22571 = "torch.aten.clamp"(%22568, %22569, %22570) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%22571, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %22572 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %22573 = "torch.prims.convert_element_type"(%22571, %22572) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%22573, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %22574 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %22575 = "torch.aten.unsqueeze"(%17439, %22574) : (!torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>
    %22576 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %22577 = "torch.constant.int"() <{value = 1024 : i64}> : () -> !torch.int
    %22578 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %22579 = "torch.prim.ListConstruct"(%22576, %22577, %22578) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %22580 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %22581 = "torch.aten.expand"(%22575, %22579, %22580) : (!torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ>
    %22582 = "torch_c.to_builtin_tensor"(%22573) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %22583 = "torch_c.to_builtin_tensor"(%22581) : (!torch.vtensor<[4,1024,4096],f8E4M3FNUZ>) -> tensor<4x1024x4096xf8E4M3FNUZ>
    %22584 = "util.call"(%22582, %22583) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x?x1024xf32>
    %22585 = "torch_c.from_builtin_tensor"(%22584) : (tensor<4x?x1024xf32>) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%22585, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %22586 = "torch.aten.div.Tensor"(%22585, %17441) : (!torch.vtensor<[4,?,1024],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%22586, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %22587 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %22588 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %22589 = "torch.aten.clamp"(%22586, %22587, %22588) : (!torch.vtensor<[4,?,1024],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%22589, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %22590 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %22591 = "torch.prims.convert_element_type"(%22589, %22590) : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> !torch.vtensor<[4,?,1024],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%22591, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.int) -> ()
    %22592 = "torch.aten.div.Tensor"(%22543, %17443) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%22592, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %22593 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %22594 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %22595 = "torch.aten.clamp"(%22592, %22593, %22594) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%22595, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %22596 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %22597 = "torch.prims.convert_element_type"(%22595, %22596) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%22597, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %22598 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %22599 = "torch.aten.unsqueeze"(%17445, %22598) : (!torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>
    %22600 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %22601 = "torch.constant.int"() <{value = 1024 : i64}> : () -> !torch.int
    %22602 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %22603 = "torch.prim.ListConstruct"(%22600, %22601, %22602) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %22604 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %22605 = "torch.aten.expand"(%22599, %22603, %22604) : (!torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ>
    %22606 = "torch_c.to_builtin_tensor"(%22597) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %22607 = "torch_c.to_builtin_tensor"(%22605) : (!torch.vtensor<[4,1024,4096],f8E4M3FNUZ>) -> tensor<4x1024x4096xf8E4M3FNUZ>
    %22608 = "util.call"(%22606, %22607) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x?x1024xf32>
    %22609 = "torch_c.from_builtin_tensor"(%22608) : (tensor<4x?x1024xf32>) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%22609, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %22610 = "torch.aten.div.Tensor"(%22609, %17447) : (!torch.vtensor<[4,?,1024],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%22610, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %22611 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %22612 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %22613 = "torch.aten.clamp"(%22610, %22611, %22612) : (!torch.vtensor<[4,?,1024],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%22613, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %22614 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %22615 = "torch.prims.convert_element_type"(%22613, %22614) : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> !torch.vtensor<[4,?,1024],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%22615, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.int) -> ()
    %22616 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %22617 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %22618 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %22619 = "torch.prim.ListConstruct"(%22616, %18481, %22617, %22618) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %22620 = "torch.aten.view"(%22567, %22619) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%22620, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %22621 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %22622 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %22623 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %22624 = "torch.prim.ListConstruct"(%22621, %18481, %22622, %22623) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %22625 = "torch.aten.view"(%22591, %22624) : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%22625, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %22626 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %22627 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %22628 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %22629 = "torch.prim.ListConstruct"(%22626, %18481, %22627, %22628) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %22630 = "torch.aten.view"(%22615, %22629) : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%22630, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %22631 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
    %22632 = "torch.constant.none"() : () -> !torch.none
    %22633 = "torch.constant.none"() : () -> !torch.none
    %22634 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %22635 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %22636 = "torch.aten.arange"(%22631, %22632, %22633, %22634, %22635) : (!torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[131072],si64>
    %22637 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %22638 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %22639 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %22640 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %22641 = "torch.constant.none"() : () -> !torch.none
    %22642 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %22643 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %22644 = "torch.aten.arange.start_step"(%22637, %22638, %22639, %22640, %22641, %22642, %22643) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[64],si64>
    %22645 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %22646 = "torch.prims.convert_element_type"(%22644, %22645) : (!torch.vtensor<[64],si64>, !torch.int) -> !torch.vtensor<[64],f32>
    %22647 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %22648 = "torch.aten.div.Scalar"(%22646, %22647) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %22649 = "torch.constant.float"() <{value = 5.000000e+05 : f64}> : () -> !torch.float
    %22650 = "torch.aten.pow.Scalar"(%22649, %22648) : (!torch.float, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %22651 = "torch.aten.reciprocal"(%22650) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %22652 = "torch.constant.float"() <{value = 1.000000e+00 : f64}> : () -> !torch.float
    %22653 = "torch.aten.mul.Scalar"(%22651, %22652) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
    %22654 = "torch.aten.reciprocal"(%22653) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %22655 = "torch.constant.float"() <{value = 6.2831853071795862 : f64}> : () -> !torch.float
    %22656 = "torch.aten.mul.Scalar"(%22654, %22655) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
    %22657 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
    %22658 = "torch.aten.gt.Scalar"(%22656, %22657) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %22659 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %22660 = "torch.aten.div.Scalar"(%22653, %22659) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %22661 = "torch.aten.where.self"(%22658, %22660, %22653) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %22662 = "torch.aten.reciprocal"(%22656) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %22663 = "torch.constant.int"() <{value = 8192 : i64}> : () -> !torch.int
    %22664 = "torch.aten.mul.Scalar"(%22662, %22663) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %22665 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %22666 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %22667 = "torch.aten.sub.Scalar"(%22664, %22665, %22666) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
    %22668 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %22669 = "torch.aten.div.Scalar"(%22667, %22668) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %22670 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %22671 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %22672 = "torch.aten.rsub.Scalar"(%22669, %22670, %22671) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
    %22673 = "torch.aten.mul.Tensor"(%22672, %22661) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %22674 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %22675 = "torch.aten.div.Scalar"(%22673, %22674) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %22676 = "torch.aten.mul.Tensor"(%22669, %22661) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %22677 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %22678 = "torch.aten.add.Tensor"(%22675, %22676, %22677) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %22679 = "torch.constant.float"() <{value = 2.048000e+03 : f64}> : () -> !torch.float
    %22680 = "torch.aten.lt.Scalar"(%22656, %22679) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %22681 = "torch.aten.bitwise_not"(%22680) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %22682 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
    %22683 = "torch.aten.gt.Scalar"(%22656, %22682) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %22684 = "torch.aten.bitwise_not"(%22683) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %22685 = "torch.aten.mul.Tensor"(%22681, %22684) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %22686 = "torch.aten.where.self"(%22685, %22678, %22661) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %22687 = "torch.prim.ListConstruct"(%22686, %22686) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.list<vtensor>
    %22688 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %22689 = "torch.aten.cat"(%22687, %22688) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[128],f32>
    %22690 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %22691 = "torch.prims.convert_element_type"(%22636, %22690) : (!torch.vtensor<[131072],si64>, !torch.int) -> !torch.vtensor<[131072],f32>
    %22692 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %22693 = "torch.prims.convert_element_type"(%22689, %22692) : (!torch.vtensor<[128],f32>, !torch.int) -> !torch.vtensor<[128],f32>
    %22694 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
    %22695 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %22696 = "torch.prim.ListConstruct"(%22694, %22695) : (!torch.int, !torch.int) -> !torch.list<int>
    %22697 = "torch.aten.view"(%22691, %22696) : (!torch.vtensor<[131072],f32>, !torch.list<int>) -> !torch.vtensor<[131072,1],f32>
    %22698 = "torch.aten.mul.Tensor"(%22697, %22693) : (!torch.vtensor<[131072,1],f32>, !torch.vtensor<[128],f32>) -> !torch.vtensor<[131072,128],f32>
    %22699 = "torch.aten.cos"(%22698) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
    %22700 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %22701 = "torch.prims.convert_element_type"(%22699, %22700) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
    %22702 = "torch.aten.sin"(%22698) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
    %22703 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %22704 = "torch.prims.convert_element_type"(%22702, %22703) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
    %22705 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %22706 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %22707 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %22708 = "torch.aten.slice.Tensor"(%22701, %22705, %22706, %18481, %22707) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%22708, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %22709 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %22710 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %22711 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %22712 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %22713 = "torch.aten.slice.Tensor"(%22708, %22709, %22710, %22711, %22712) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%22713, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %22714 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %22715 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %22716 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %22717 = "torch.aten.slice.Tensor"(%22704, %22714, %22715, %18481, %22716) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%22717, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %22718 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %22719 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %22720 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %22721 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %22722 = "torch.aten.slice.Tensor"(%22717, %22718, %22719, %22720, %22721) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%22722, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %22723 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %22724 = "torch.aten.unsqueeze"(%22713, %22723) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%22724, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %22725 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %22726 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %22727 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %22728 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %22729 = "torch.aten.slice.Tensor"(%22724, %22725, %22726, %22727, %22728) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%22729, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %22730 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %22731 = "torch.aten.unsqueeze"(%22729, %22730) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%22731, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %22732 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %22733 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %22734 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %22735 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %22736 = "torch.aten.slice.Tensor"(%22731, %22732, %22733, %22734, %22735) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%22736, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %22737 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %22738 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %22739 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %22740 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %22741 = "torch.prim.ListConstruct"(%22737, %22738, %22739, %22740) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %22742 = "torch.aten.repeat"(%22736, %22741) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%22742, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
    %22743 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %22744 = "torch.aten.unsqueeze"(%22722, %22743) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%22744, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %22745 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %22746 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %22747 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %22748 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %22749 = "torch.aten.slice.Tensor"(%22744, %22745, %22746, %22747, %22748) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%22749, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %22750 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %22751 = "torch.aten.unsqueeze"(%22749, %22750) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%22751, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %22752 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %22753 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %22754 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %22755 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %22756 = "torch.aten.slice.Tensor"(%22751, %22752, %22753, %22754, %22755) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%22756, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %22757 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %22758 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %22759 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %22760 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %22761 = "torch.prim.ListConstruct"(%22757, %22758, %22759, %22760) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %22762 = "torch.aten.repeat"(%22756, %22761) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%22762, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
    %22763 = "torch.aten.mul.Tensor"(%22620, %22742) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%22763, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %22764 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %22765 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %22766 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %22767 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %22768 = "torch.aten.slice.Tensor"(%22620, %22764, %22765, %22766, %22767) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%22768, %18474) <{shape_expressions = #map18}> : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.int) -> ()
    %22769 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %22770 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %22771 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %22772 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %22773 = "torch.aten.slice.Tensor"(%22620, %22769, %22770, %22771, %22772) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%22773, %18474) <{shape_expressions = #map18}> : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.int) -> ()
    %22774 = "torch.aten.neg"(%22773) : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>) -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%22774, %18474) <{shape_expressions = #map18}> : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.int) -> ()
    %22775 = "torch.prim.ListConstruct"(%22774, %22768) : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>) -> !torch.list<vtensor>
    %22776 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %22777 = "torch.aten.cat"(%22775, %22776) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%22777, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %22778 = "torch.aten.mul.Tensor"(%22777, %22762) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%22778, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %22779 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %22780 = "torch.aten.add.Tensor"(%22763, %22778, %22779) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%22780, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %22781 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
    %22782 = "torch.constant.none"() : () -> !torch.none
    %22783 = "torch.constant.none"() : () -> !torch.none
    %22784 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %22785 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %22786 = "torch.aten.arange"(%22781, %22782, %22783, %22784, %22785) : (!torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[131072],si64>
    %22787 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %22788 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %22789 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %22790 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %22791 = "torch.constant.none"() : () -> !torch.none
    %22792 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %22793 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %22794 = "torch.aten.arange.start_step"(%22787, %22788, %22789, %22790, %22791, %22792, %22793) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[64],si64>
    %22795 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %22796 = "torch.prims.convert_element_type"(%22794, %22795) : (!torch.vtensor<[64],si64>, !torch.int) -> !torch.vtensor<[64],f32>
    %22797 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %22798 = "torch.aten.div.Scalar"(%22796, %22797) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %22799 = "torch.constant.float"() <{value = 5.000000e+05 : f64}> : () -> !torch.float
    %22800 = "torch.aten.pow.Scalar"(%22799, %22798) : (!torch.float, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %22801 = "torch.aten.reciprocal"(%22800) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %22802 = "torch.constant.float"() <{value = 1.000000e+00 : f64}> : () -> !torch.float
    %22803 = "torch.aten.mul.Scalar"(%22801, %22802) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
    %22804 = "torch.aten.reciprocal"(%22803) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %22805 = "torch.constant.float"() <{value = 6.2831853071795862 : f64}> : () -> !torch.float
    %22806 = "torch.aten.mul.Scalar"(%22804, %22805) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
    %22807 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
    %22808 = "torch.aten.gt.Scalar"(%22806, %22807) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %22809 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %22810 = "torch.aten.div.Scalar"(%22803, %22809) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %22811 = "torch.aten.where.self"(%22808, %22810, %22803) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %22812 = "torch.aten.reciprocal"(%22806) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %22813 = "torch.constant.int"() <{value = 8192 : i64}> : () -> !torch.int
    %22814 = "torch.aten.mul.Scalar"(%22812, %22813) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %22815 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %22816 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %22817 = "torch.aten.sub.Scalar"(%22814, %22815, %22816) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
    %22818 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %22819 = "torch.aten.div.Scalar"(%22817, %22818) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %22820 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %22821 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %22822 = "torch.aten.rsub.Scalar"(%22819, %22820, %22821) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
    %22823 = "torch.aten.mul.Tensor"(%22822, %22811) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %22824 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %22825 = "torch.aten.div.Scalar"(%22823, %22824) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %22826 = "torch.aten.mul.Tensor"(%22819, %22811) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %22827 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %22828 = "torch.aten.add.Tensor"(%22825, %22826, %22827) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %22829 = "torch.constant.float"() <{value = 2.048000e+03 : f64}> : () -> !torch.float
    %22830 = "torch.aten.lt.Scalar"(%22806, %22829) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %22831 = "torch.aten.bitwise_not"(%22830) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %22832 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
    %22833 = "torch.aten.gt.Scalar"(%22806, %22832) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %22834 = "torch.aten.bitwise_not"(%22833) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %22835 = "torch.aten.mul.Tensor"(%22831, %22834) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %22836 = "torch.aten.where.self"(%22835, %22828, %22811) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %22837 = "torch.prim.ListConstruct"(%22836, %22836) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.list<vtensor>
    %22838 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %22839 = "torch.aten.cat"(%22837, %22838) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[128],f32>
    %22840 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %22841 = "torch.prims.convert_element_type"(%22786, %22840) : (!torch.vtensor<[131072],si64>, !torch.int) -> !torch.vtensor<[131072],f32>
    %22842 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %22843 = "torch.prims.convert_element_type"(%22839, %22842) : (!torch.vtensor<[128],f32>, !torch.int) -> !torch.vtensor<[128],f32>
    %22844 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
    %22845 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %22846 = "torch.prim.ListConstruct"(%22844, %22845) : (!torch.int, !torch.int) -> !torch.list<int>
    %22847 = "torch.aten.view"(%22841, %22846) : (!torch.vtensor<[131072],f32>, !torch.list<int>) -> !torch.vtensor<[131072,1],f32>
    %22848 = "torch.aten.mul.Tensor"(%22847, %22843) : (!torch.vtensor<[131072,1],f32>, !torch.vtensor<[128],f32>) -> !torch.vtensor<[131072,128],f32>
    %22849 = "torch.aten.cos"(%22848) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
    %22850 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %22851 = "torch.prims.convert_element_type"(%22849, %22850) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
    %22852 = "torch.aten.sin"(%22848) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
    %22853 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %22854 = "torch.prims.convert_element_type"(%22852, %22853) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
    %22855 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %22856 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %22857 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %22858 = "torch.aten.slice.Tensor"(%22851, %22855, %22856, %18481, %22857) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%22858, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %22859 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %22860 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %22861 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %22862 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %22863 = "torch.aten.slice.Tensor"(%22858, %22859, %22860, %22861, %22862) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%22863, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %22864 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %22865 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %22866 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %22867 = "torch.aten.slice.Tensor"(%22854, %22864, %22865, %18481, %22866) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%22867, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %22868 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %22869 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %22870 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %22871 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %22872 = "torch.aten.slice.Tensor"(%22867, %22868, %22869, %22870, %22871) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%22872, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %22873 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %22874 = "torch.aten.unsqueeze"(%22863, %22873) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%22874, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %22875 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %22876 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %22877 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %22878 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %22879 = "torch.aten.slice.Tensor"(%22874, %22875, %22876, %22877, %22878) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%22879, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %22880 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %22881 = "torch.aten.unsqueeze"(%22879, %22880) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%22881, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %22882 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %22883 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %22884 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %22885 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %22886 = "torch.aten.slice.Tensor"(%22881, %22882, %22883, %22884, %22885) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%22886, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %22887 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %22888 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %22889 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %22890 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %22891 = "torch.prim.ListConstruct"(%22887, %22888, %22889, %22890) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %22892 = "torch.aten.repeat"(%22886, %22891) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%22892, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
    %22893 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %22894 = "torch.aten.unsqueeze"(%22872, %22893) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%22894, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %22895 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %22896 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %22897 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %22898 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %22899 = "torch.aten.slice.Tensor"(%22894, %22895, %22896, %22897, %22898) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%22899, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %22900 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %22901 = "torch.aten.unsqueeze"(%22899, %22900) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%22901, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %22902 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %22903 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %22904 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %22905 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %22906 = "torch.aten.slice.Tensor"(%22901, %22902, %22903, %22904, %22905) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%22906, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %22907 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %22908 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %22909 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %22910 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %22911 = "torch.prim.ListConstruct"(%22907, %22908, %22909, %22910) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %22912 = "torch.aten.repeat"(%22906, %22911) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%22912, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
    %22913 = "torch.aten.mul.Tensor"(%22625, %22892) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%22913, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %22914 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %22915 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %22916 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %22917 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %22918 = "torch.aten.slice.Tensor"(%22625, %22914, %22915, %22916, %22917) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%22918, %18474) <{shape_expressions = #map19}> : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.int) -> ()
    %22919 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %22920 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %22921 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %22922 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %22923 = "torch.aten.slice.Tensor"(%22625, %22919, %22920, %22921, %22922) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%22923, %18474) <{shape_expressions = #map19}> : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.int) -> ()
    %22924 = "torch.aten.neg"(%22923) : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>) -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%22924, %18474) <{shape_expressions = #map19}> : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.int) -> ()
    %22925 = "torch.prim.ListConstruct"(%22924, %22918) : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>) -> !torch.list<vtensor>
    %22926 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %22927 = "torch.aten.cat"(%22925, %22926) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%22927, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %22928 = "torch.aten.mul.Tensor"(%22927, %22912) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%22928, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %22929 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %22930 = "torch.aten.add.Tensor"(%22913, %22928, %22929) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%22930, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %22931 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %22932 = "torch.aten.mul.Scalar"(%arg69, %22931) : (!torch.vtensor<[4,?],si64>, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%22932, %18474) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %22933 = "torch.constant.int"() <{value = 12 : i64}> : () -> !torch.int
    %22934 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %22935 = "torch.aten.add.Scalar"(%22932, %22933, %22934) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%22935, %18474) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %22936 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %22937 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %22938 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %22939 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %22940 = "torch.prim.ListConstruct"(%22936, %18477, %22937, %22938, %22939) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %22941 = "torch.aten.view"(%22930, %22940) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%22941, %18474) <{shape_expressions = #map22}> : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %22942 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %22943 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %22944 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %22945 = "torch.prim.ListConstruct"(%19011, %22942, %22943, %22944) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %22946 = "torch.aten.view"(%22941, %22945) : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%22946, %18474) <{shape_expressions = #map23}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %22947 = "torch.prim.ListConstruct"(%19011) : (!torch.int) -> !torch.list<int>
    %22948 = "torch.aten.view"(%22935, %22947) : (!torch.vtensor<[4,?],si64>, !torch.list<int>) -> !torch.vtensor<[?],si64>
    "torch.bind_symbolic_shape"(%22948, %18474) <{shape_expressions = #map24}> : (!torch.vtensor<[?],si64>, !torch.int) -> ()
    %22949 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %22950 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %22951 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %22952 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %22953 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %22954 = "torch.prim.ListConstruct"(%18479, %22949, %22950, %22951, %22952, %22953) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %22955 = "torch.aten.view"(%22357, %22954) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%22955, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %22956 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %22957 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %22958 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %22959 = "torch.prim.ListConstruct"(%18993, %22956, %22957, %22958) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %22960 = "torch.aten.view"(%22955, %22959) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%22960, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %22961 = "torch.prim.ListConstruct"(%22948) : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>>
    %22962 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %22963 = "torch.aten.index_put"(%22960, %22961, %22946, %22962) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.bool) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%22963, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %22964 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %22965 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %22966 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %22967 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %22968 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %22969 = "torch.prim.ListConstruct"(%18479, %22964, %22965, %22966, %22967, %22968) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %22970 = "torch.aten.view"(%22963, %22969) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%22970, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %22971 = "torch.constant.int"() <{value = 2097152 : i64}> : () -> !torch.int
    %22972 = "torch.prim.ListConstruct"(%18479, %22971) : (!torch.int, !torch.int) -> !torch.list<int>
    %22973 = "torch.aten.view"(%22970, %22972) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,2097152],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%22973, %18475) <{shape_expressions = #map2}> : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> ()
    %22974 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %22975 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %22976 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %22977 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %22978 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %22979 = "torch.prim.ListConstruct"(%18479, %22974, %22975, %22976, %22977, %22978) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %22980 = "torch.aten.view"(%22973, %22979) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%22980, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %22981 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %22982 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %22983 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %22984 = "torch.prim.ListConstruct"(%18993, %22981, %22982, %22983) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %22985 = "torch.aten.view"(%22980, %22984) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%22985, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %22986 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %22987 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %22988 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %22989 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %22990 = "torch.prim.ListConstruct"(%22986, %18477, %22987, %22988, %22989) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %22991 = "torch.aten.view"(%22630, %22990) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%22991, %18474) <{shape_expressions = #map22}> : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %22992 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %22993 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %22994 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %22995 = "torch.prim.ListConstruct"(%19011, %22992, %22993, %22994) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %22996 = "torch.aten.view"(%22991, %22995) : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%22996, %18474) <{shape_expressions = #map23}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %22997 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %22998 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %22999 = "torch.aten.add.Scalar"(%22935, %22997, %22998) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%22999, %18474) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %23000 = "torch.prim.ListConstruct"(%19011) : (!torch.int) -> !torch.list<int>
    %23001 = "torch.aten.view"(%22999, %23000) : (!torch.vtensor<[4,?],si64>, !torch.list<int>) -> !torch.vtensor<[?],si64>
    "torch.bind_symbolic_shape"(%23001, %18474) <{shape_expressions = #map24}> : (!torch.vtensor<[?],si64>, !torch.int) -> ()
    %23002 = "torch.prim.ListConstruct"(%23001) : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>>
    %23003 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %23004 = "torch.aten.index_put"(%22985, %23002, %22996, %23003) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.bool) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%23004, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %23005 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %23006 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %23007 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %23008 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %23009 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %23010 = "torch.prim.ListConstruct"(%18479, %23005, %23006, %23007, %23008, %23009) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %23011 = "torch.aten.view"(%23004, %23010) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%23011, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %23012 = "torch.constant.int"() <{value = 2097152 : i64}> : () -> !torch.int
    %23013 = "torch.prim.ListConstruct"(%18479, %23012) : (!torch.int, !torch.int) -> !torch.list<int>
    %23014 = "torch.aten.view"(%23011, %23013) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,2097152],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%23014, %18475) <{shape_expressions = #map2}> : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> ()
    %23015 = "torch.constant.int"() <{value = -2 : i64}> : () -> !torch.int
    %23016 = "torch.aten.unsqueeze"(%22930, %23015) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%23016, %18474) <{shape_expressions = #map25}> : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.int) -> ()
    %23017 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %23018 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %23019 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %23020 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %23021 = "torch.prim.ListConstruct"(%23017, %18481, %23018, %23019, %23020) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %23022 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %23023 = "torch.aten.expand"(%23016, %23021, %23022) : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%23023, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %23024 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %23025 = "torch.aten.clone"(%23023, %23024) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%23025, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %23026 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %23027 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %23028 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %23029 = "torch.prim.ListConstruct"(%23026, %18481, %23027, %23028) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %23030 = "torch.aten._unsafe_view"(%23025, %23029) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%23030, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %23031 = "torch.constant.int"() <{value = -2 : i64}> : () -> !torch.int
    %23032 = "torch.aten.unsqueeze"(%22630, %23031) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%23032, %18474) <{shape_expressions = #map25}> : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.int) -> ()
    %23033 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %23034 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %23035 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %23036 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %23037 = "torch.prim.ListConstruct"(%23033, %18481, %23034, %23035, %23036) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %23038 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %23039 = "torch.aten.expand"(%23032, %23037, %23038) : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%23039, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %23040 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %23041 = "torch.aten.clone"(%23039, %23040) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%23041, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %23042 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %23043 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %23044 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %23045 = "torch.prim.ListConstruct"(%23042, %18481, %23043, %23044) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %23046 = "torch.aten._unsafe_view"(%23041, %23045) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%23046, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %23047 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %23048 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %23049 = "torch.aten.transpose.int"(%22780, %23047, %23048) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%23049, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %23050 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %23051 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %23052 = "torch.aten.transpose.int"(%23030, %23050, %23051) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%23052, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %23053 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %23054 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %23055 = "torch.aten.transpose.int"(%23046, %23053, %23054) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%23055, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %23056 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %23057 = "torch.aten.squeeze.dim"(%18570, %23056) : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,1,?,?],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%23057, %18474) <{shape_expressions = #map8}> : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> ()
    %23058 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %23059 = "torch.aten.squeeze.dim"(%23057, %23058) : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,1,?,?],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%23059, %18474) <{shape_expressions = #map8}> : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> ()
    %23060 = "torch_c.to_builtin_tensor"(%23049) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %23061 = "torch_c.to_builtin_tensor"(%23052) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %23062 = "torch_c.to_builtin_tensor"(%23055) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %23063 = "torch_c.to_builtin_tensor"(%23059) : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>) -> tensor<4x1x?x?xf8E4M3FNUZ>
    %23064 = "tensor.cast"(%23063) : (tensor<4x1x?x?xf8E4M3FNUZ>) -> tensor<?x?x?x?xf8E4M3FNUZ>
    %23065 = "torch_c.to_builtin_tensor"(%17449) : (!torch.vtensor<[],f32>) -> tensor<f32>
    %23066 = "util.call"(%23060, %23061, %23062, %23065, %23064) <{callee = @sharktank_masked_flash_attention_4_32_128_128_f8E4M3FNUZ_f32_f32}> : (tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<f32>, tensor<?x?x?x?xf8E4M3FNUZ>) -> tensor<4x32x?x128xf32>
    %23067 = "torch_c.from_builtin_tensor"(%23066) : (tensor<4x32x?x128xf32>) -> !torch.vtensor<[4,32,?,128],f32>
    "torch.bind_symbolic_shape"(%23067, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f32>, !torch.int) -> ()
    %23068 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %23069 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %23070 = "torch.aten.transpose.int"(%23067, %23068, %23069) : (!torch.vtensor<[4,32,?,128],f32>, !torch.int, !torch.int) -> !torch.vtensor<[4,?,32,128],f32>
    "torch.bind_symbolic_shape"(%23070, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f32>, !torch.int) -> ()
    %23071 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %23072 = "torch.aten.clone"(%23070, %23071) : (!torch.vtensor<[4,?,32,128],f32>, !torch.int) -> !torch.vtensor<[4,?,32,128],f32>
    "torch.bind_symbolic_shape"(%23072, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f32>, !torch.int) -> ()
    %23073 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %23074 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %23075 = "torch.prim.ListConstruct"(%23073, %18481, %23074) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %23076 = "torch.aten._unsafe_view"(%23072, %23075) : (!torch.vtensor<[4,?,32,128],f32>, !torch.list<int>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%23076, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %23077 = "torch.aten.div.Tensor"(%23076, %17451) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%23077, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %23078 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %23079 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %23080 = "torch.aten.clamp"(%23077, %23078, %23079) : (!torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%23080, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %23081 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %23082 = "torch.prims.convert_element_type"(%23080, %23081) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%23082, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %23083 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %23084 = "torch.aten.unsqueeze"(%17453, %23083) : (!torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>
    %23085 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %23086 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %23087 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %23088 = "torch.prim.ListConstruct"(%23085, %23086, %23087) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %23089 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %23090 = "torch.aten.expand"(%23084, %23088, %23089) : (!torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ>
    %23091 = "torch_c.to_builtin_tensor"(%23082) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %23092 = "torch_c.to_builtin_tensor"(%23090) : (!torch.vtensor<[4,4096,4096],f8E4M3FNUZ>) -> tensor<4x4096x4096xf8E4M3FNUZ>
    %23093 = "util.call"(%23091, %23092) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x?x4096xf32>
    %23094 = "torch_c.from_builtin_tensor"(%23093) : (tensor<4x?x4096xf32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%23094, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %23095 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %23096 = "torch.prims.convert_element_type"(%23094, %23095) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%23096, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %23097 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %23098 = "torch.aten.add.Tensor"(%22524, %23096, %23097) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%23098, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %23099 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %23100 = "torch.prims.convert_element_type"(%23098, %23099) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%23100, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %23101 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %23102 = "torch.aten.pow.Tensor_Scalar"(%23100, %23101) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%23102, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %23103 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %23104 = "torch.prim.ListConstruct"(%23103) : (!torch.int) -> !torch.list<int>
    %23105 = "torch.constant.bool"() <{value = true}> : () -> !torch.bool
    %23106 = "torch.constant.none"() : () -> !torch.none
    %23107 = "torch.aten.mean.dim"(%23102, %23104, %23105, %23106) : (!torch.vtensor<[4,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%23107, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %23108 = "torch.constant.float"() <{value = 1.000000e-05 : f64}> : () -> !torch.float
    %23109 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %23110 = "torch.aten.add.Scalar"(%23107, %23108, %23109) : (!torch.vtensor<[4,?,1],f32>, !torch.float, !torch.int) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%23110, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %23111 = "torch.aten.rsqrt"(%23110) : (!torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%23111, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %23112 = "torch.aten.mul.Tensor"(%23100, %23111) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%23112, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %23113 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %23114 = "torch.prims.convert_element_type"(%23112, %23113) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%23114, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %23115 = "torch.aten.mul.Tensor"(%17455, %23114) : (!torch.vtensor<[4096],bf16>, !torch.vtensor<[4,?,4096],bf16>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%23115, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %23116 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %23117 = "torch.prims.convert_element_type"(%23115, %23116) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%23117, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %23118 = "torch.aten.div.Tensor"(%23117, %17457) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%23118, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %23119 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %23120 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %23121 = "torch.aten.clamp"(%23118, %23119, %23120) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%23121, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %23122 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %23123 = "torch.prims.convert_element_type"(%23121, %23122) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%23123, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %23124 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %23125 = "torch.aten.unsqueeze"(%17459, %23124) : (!torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>
    %23126 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %23127 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %23128 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %23129 = "torch.prim.ListConstruct"(%23126, %23127, %23128) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %23130 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %23131 = "torch.aten.expand"(%23125, %23129, %23130) : (!torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ>
    %23132 = "torch_c.to_builtin_tensor"(%23123) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %23133 = "torch_c.to_builtin_tensor"(%23131) : (!torch.vtensor<[4,14336,4096],f8E4M3FNUZ>) -> tensor<4x14336x4096xf8E4M3FNUZ>
    %23134 = "util.call"(%23132, %23133) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x?x14336xf32>
    %23135 = "torch_c.from_builtin_tensor"(%23134) : (tensor<4x?x14336xf32>) -> !torch.vtensor<[4,?,14336],f32>
    "torch.bind_symbolic_shape"(%23135, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> ()
    %23136 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %23137 = "torch.prims.convert_element_type"(%23135, %23136) : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%23137, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %23138 = "torch.aten.silu"(%23137) : (!torch.vtensor<[4,?,14336],bf16>) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%23138, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %23139 = "torch.aten.div.Tensor"(%23117, %17461) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%23139, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %23140 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %23141 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %23142 = "torch.aten.clamp"(%23139, %23140, %23141) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%23142, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %23143 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %23144 = "torch.prims.convert_element_type"(%23142, %23143) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%23144, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %23145 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %23146 = "torch.aten.unsqueeze"(%17463, %23145) : (!torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>
    %23147 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %23148 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %23149 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %23150 = "torch.prim.ListConstruct"(%23147, %23148, %23149) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %23151 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %23152 = "torch.aten.expand"(%23146, %23150, %23151) : (!torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ>
    %23153 = "torch_c.to_builtin_tensor"(%23144) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %23154 = "torch_c.to_builtin_tensor"(%23152) : (!torch.vtensor<[4,14336,4096],f8E4M3FNUZ>) -> tensor<4x14336x4096xf8E4M3FNUZ>
    %23155 = "util.call"(%23153, %23154) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x?x14336xf32>
    %23156 = "torch_c.from_builtin_tensor"(%23155) : (tensor<4x?x14336xf32>) -> !torch.vtensor<[4,?,14336],f32>
    "torch.bind_symbolic_shape"(%23156, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> ()
    %23157 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %23158 = "torch.prims.convert_element_type"(%23156, %23157) : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%23158, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %23159 = "torch.aten.mul.Tensor"(%23138, %23158) : (!torch.vtensor<[4,?,14336],bf16>, !torch.vtensor<[4,?,14336],bf16>) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%23159, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %23160 = "torch.aten.div.Tensor"(%23159, %17465) : (!torch.vtensor<[4,?,14336],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%23160, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %23161 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %23162 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %23163 = "torch.aten.clamp"(%23160, %23161, %23162) : (!torch.vtensor<[4,?,14336],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%23163, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %23164 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %23165 = "torch.prims.convert_element_type"(%23163, %23164) : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> !torch.vtensor<[4,?,14336],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%23165, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],f8E4M3FNUZ>, !torch.int) -> ()
    %23166 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %23167 = "torch.aten.unsqueeze"(%17467, %23166) : (!torch.vtensor<[4096,14336],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,14336],f8E4M3FNUZ>
    %23168 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %23169 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %23170 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %23171 = "torch.prim.ListConstruct"(%23168, %23169, %23170) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %23172 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %23173 = "torch.aten.expand"(%23167, %23171, %23172) : (!torch.vtensor<[1,4096,14336],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,14336],f8E4M3FNUZ>
    %23174 = "torch_c.to_builtin_tensor"(%23165) : (!torch.vtensor<[4,?,14336],f8E4M3FNUZ>) -> tensor<4x?x14336xf8E4M3FNUZ>
    %23175 = "torch_c.to_builtin_tensor"(%23173) : (!torch.vtensor<[4,4096,14336],f8E4M3FNUZ>) -> tensor<4x4096x14336xf8E4M3FNUZ>
    %23176 = "util.call"(%23174, %23175) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx14336xf8E4M3FNUZ_R4x4096x14336xf8E4M3FNUZ}> : (tensor<4x?x14336xf8E4M3FNUZ>, tensor<4x4096x14336xf8E4M3FNUZ>) -> tensor<4x?x4096xf32>
    %23177 = "torch_c.from_builtin_tensor"(%23176) : (tensor<4x?x4096xf32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%23177, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %23178 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %23179 = "torch.prims.convert_element_type"(%23177, %23178) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%23179, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %23180 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %23181 = "torch.aten.add.Tensor"(%23098, %23179, %23180) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%23181, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %23182 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %23183 = "torch.prims.convert_element_type"(%23181, %23182) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%23183, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %23184 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %23185 = "torch.aten.pow.Tensor_Scalar"(%23183, %23184) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%23185, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %23186 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %23187 = "torch.prim.ListConstruct"(%23186) : (!torch.int) -> !torch.list<int>
    %23188 = "torch.constant.bool"() <{value = true}> : () -> !torch.bool
    %23189 = "torch.constant.none"() : () -> !torch.none
    %23190 = "torch.aten.mean.dim"(%23185, %23187, %23188, %23189) : (!torch.vtensor<[4,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%23190, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %23191 = "torch.constant.float"() <{value = 1.000000e-05 : f64}> : () -> !torch.float
    %23192 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %23193 = "torch.aten.add.Scalar"(%23190, %23191, %23192) : (!torch.vtensor<[4,?,1],f32>, !torch.float, !torch.int) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%23193, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %23194 = "torch.aten.rsqrt"(%23193) : (!torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%23194, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %23195 = "torch.aten.mul.Tensor"(%23183, %23194) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%23195, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %23196 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %23197 = "torch.prims.convert_element_type"(%23195, %23196) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%23197, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %23198 = "torch.aten.mul.Tensor"(%17469, %23197) : (!torch.vtensor<[4096],bf16>, !torch.vtensor<[4,?,4096],bf16>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%23198, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %23199 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %23200 = "torch.prims.convert_element_type"(%23198, %23199) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%23200, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %23201 = "torch.aten.div.Tensor"(%23200, %17471) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%23201, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %23202 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %23203 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %23204 = "torch.aten.clamp"(%23201, %23202, %23203) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%23204, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %23205 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %23206 = "torch.prims.convert_element_type"(%23204, %23205) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%23206, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %23207 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %23208 = "torch.aten.unsqueeze"(%17473, %23207) : (!torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>
    %23209 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %23210 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %23211 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %23212 = "torch.prim.ListConstruct"(%23209, %23210, %23211) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %23213 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %23214 = "torch.aten.expand"(%23208, %23212, %23213) : (!torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ>
    %23215 = "torch_c.to_builtin_tensor"(%23206) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %23216 = "torch_c.to_builtin_tensor"(%23214) : (!torch.vtensor<[4,4096,4096],f8E4M3FNUZ>) -> tensor<4x4096x4096xf8E4M3FNUZ>
    %23217 = "util.call"(%23215, %23216) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x?x4096xf32>
    %23218 = "torch_c.from_builtin_tensor"(%23217) : (tensor<4x?x4096xf32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%23218, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %23219 = "torch.aten.div.Tensor"(%23218, %17475) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%23219, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %23220 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %23221 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %23222 = "torch.aten.clamp"(%23219, %23220, %23221) : (!torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%23222, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %23223 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %23224 = "torch.prims.convert_element_type"(%23222, %23223) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%23224, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %23225 = "torch.aten.div.Tensor"(%23200, %17477) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%23225, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %23226 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %23227 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %23228 = "torch.aten.clamp"(%23225, %23226, %23227) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%23228, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %23229 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %23230 = "torch.prims.convert_element_type"(%23228, %23229) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%23230, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %23231 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %23232 = "torch.aten.unsqueeze"(%17479, %23231) : (!torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>
    %23233 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %23234 = "torch.constant.int"() <{value = 1024 : i64}> : () -> !torch.int
    %23235 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %23236 = "torch.prim.ListConstruct"(%23233, %23234, %23235) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %23237 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %23238 = "torch.aten.expand"(%23232, %23236, %23237) : (!torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ>
    %23239 = "torch_c.to_builtin_tensor"(%23230) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %23240 = "torch_c.to_builtin_tensor"(%23238) : (!torch.vtensor<[4,1024,4096],f8E4M3FNUZ>) -> tensor<4x1024x4096xf8E4M3FNUZ>
    %23241 = "util.call"(%23239, %23240) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x?x1024xf32>
    %23242 = "torch_c.from_builtin_tensor"(%23241) : (tensor<4x?x1024xf32>) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%23242, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %23243 = "torch.aten.div.Tensor"(%23242, %17481) : (!torch.vtensor<[4,?,1024],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%23243, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %23244 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %23245 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %23246 = "torch.aten.clamp"(%23243, %23244, %23245) : (!torch.vtensor<[4,?,1024],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%23246, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %23247 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %23248 = "torch.prims.convert_element_type"(%23246, %23247) : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> !torch.vtensor<[4,?,1024],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%23248, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.int) -> ()
    %23249 = "torch.aten.div.Tensor"(%23200, %17483) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%23249, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %23250 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %23251 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %23252 = "torch.aten.clamp"(%23249, %23250, %23251) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%23252, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %23253 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %23254 = "torch.prims.convert_element_type"(%23252, %23253) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%23254, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %23255 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %23256 = "torch.aten.unsqueeze"(%17485, %23255) : (!torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>
    %23257 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %23258 = "torch.constant.int"() <{value = 1024 : i64}> : () -> !torch.int
    %23259 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %23260 = "torch.prim.ListConstruct"(%23257, %23258, %23259) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %23261 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %23262 = "torch.aten.expand"(%23256, %23260, %23261) : (!torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ>
    %23263 = "torch_c.to_builtin_tensor"(%23254) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %23264 = "torch_c.to_builtin_tensor"(%23262) : (!torch.vtensor<[4,1024,4096],f8E4M3FNUZ>) -> tensor<4x1024x4096xf8E4M3FNUZ>
    %23265 = "util.call"(%23263, %23264) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x?x1024xf32>
    %23266 = "torch_c.from_builtin_tensor"(%23265) : (tensor<4x?x1024xf32>) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%23266, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %23267 = "torch.aten.div.Tensor"(%23266, %17487) : (!torch.vtensor<[4,?,1024],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%23267, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %23268 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %23269 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %23270 = "torch.aten.clamp"(%23267, %23268, %23269) : (!torch.vtensor<[4,?,1024],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%23270, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %23271 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %23272 = "torch.prims.convert_element_type"(%23270, %23271) : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> !torch.vtensor<[4,?,1024],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%23272, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.int) -> ()
    %23273 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %23274 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %23275 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %23276 = "torch.prim.ListConstruct"(%23273, %18481, %23274, %23275) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %23277 = "torch.aten.view"(%23224, %23276) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%23277, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %23278 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %23279 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %23280 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %23281 = "torch.prim.ListConstruct"(%23278, %18481, %23279, %23280) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %23282 = "torch.aten.view"(%23248, %23281) : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%23282, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %23283 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %23284 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %23285 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %23286 = "torch.prim.ListConstruct"(%23283, %18481, %23284, %23285) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %23287 = "torch.aten.view"(%23272, %23286) : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%23287, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %23288 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
    %23289 = "torch.constant.none"() : () -> !torch.none
    %23290 = "torch.constant.none"() : () -> !torch.none
    %23291 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %23292 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %23293 = "torch.aten.arange"(%23288, %23289, %23290, %23291, %23292) : (!torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[131072],si64>
    %23294 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %23295 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %23296 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %23297 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %23298 = "torch.constant.none"() : () -> !torch.none
    %23299 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %23300 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %23301 = "torch.aten.arange.start_step"(%23294, %23295, %23296, %23297, %23298, %23299, %23300) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[64],si64>
    %23302 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %23303 = "torch.prims.convert_element_type"(%23301, %23302) : (!torch.vtensor<[64],si64>, !torch.int) -> !torch.vtensor<[64],f32>
    %23304 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %23305 = "torch.aten.div.Scalar"(%23303, %23304) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %23306 = "torch.constant.float"() <{value = 5.000000e+05 : f64}> : () -> !torch.float
    %23307 = "torch.aten.pow.Scalar"(%23306, %23305) : (!torch.float, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %23308 = "torch.aten.reciprocal"(%23307) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %23309 = "torch.constant.float"() <{value = 1.000000e+00 : f64}> : () -> !torch.float
    %23310 = "torch.aten.mul.Scalar"(%23308, %23309) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
    %23311 = "torch.aten.reciprocal"(%23310) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %23312 = "torch.constant.float"() <{value = 6.2831853071795862 : f64}> : () -> !torch.float
    %23313 = "torch.aten.mul.Scalar"(%23311, %23312) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
    %23314 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
    %23315 = "torch.aten.gt.Scalar"(%23313, %23314) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %23316 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %23317 = "torch.aten.div.Scalar"(%23310, %23316) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %23318 = "torch.aten.where.self"(%23315, %23317, %23310) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %23319 = "torch.aten.reciprocal"(%23313) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %23320 = "torch.constant.int"() <{value = 8192 : i64}> : () -> !torch.int
    %23321 = "torch.aten.mul.Scalar"(%23319, %23320) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %23322 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %23323 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %23324 = "torch.aten.sub.Scalar"(%23321, %23322, %23323) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
    %23325 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %23326 = "torch.aten.div.Scalar"(%23324, %23325) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %23327 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %23328 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %23329 = "torch.aten.rsub.Scalar"(%23326, %23327, %23328) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
    %23330 = "torch.aten.mul.Tensor"(%23329, %23318) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %23331 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %23332 = "torch.aten.div.Scalar"(%23330, %23331) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %23333 = "torch.aten.mul.Tensor"(%23326, %23318) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %23334 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %23335 = "torch.aten.add.Tensor"(%23332, %23333, %23334) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %23336 = "torch.constant.float"() <{value = 2.048000e+03 : f64}> : () -> !torch.float
    %23337 = "torch.aten.lt.Scalar"(%23313, %23336) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %23338 = "torch.aten.bitwise_not"(%23337) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %23339 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
    %23340 = "torch.aten.gt.Scalar"(%23313, %23339) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %23341 = "torch.aten.bitwise_not"(%23340) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %23342 = "torch.aten.mul.Tensor"(%23338, %23341) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %23343 = "torch.aten.where.self"(%23342, %23335, %23318) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %23344 = "torch.prim.ListConstruct"(%23343, %23343) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.list<vtensor>
    %23345 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %23346 = "torch.aten.cat"(%23344, %23345) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[128],f32>
    %23347 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %23348 = "torch.prims.convert_element_type"(%23293, %23347) : (!torch.vtensor<[131072],si64>, !torch.int) -> !torch.vtensor<[131072],f32>
    %23349 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %23350 = "torch.prims.convert_element_type"(%23346, %23349) : (!torch.vtensor<[128],f32>, !torch.int) -> !torch.vtensor<[128],f32>
    %23351 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
    %23352 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %23353 = "torch.prim.ListConstruct"(%23351, %23352) : (!torch.int, !torch.int) -> !torch.list<int>
    %23354 = "torch.aten.view"(%23348, %23353) : (!torch.vtensor<[131072],f32>, !torch.list<int>) -> !torch.vtensor<[131072,1],f32>
    %23355 = "torch.aten.mul.Tensor"(%23354, %23350) : (!torch.vtensor<[131072,1],f32>, !torch.vtensor<[128],f32>) -> !torch.vtensor<[131072,128],f32>
    %23356 = "torch.aten.cos"(%23355) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
    %23357 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %23358 = "torch.prims.convert_element_type"(%23356, %23357) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
    %23359 = "torch.aten.sin"(%23355) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
    %23360 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %23361 = "torch.prims.convert_element_type"(%23359, %23360) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
    %23362 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %23363 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %23364 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %23365 = "torch.aten.slice.Tensor"(%23358, %23362, %23363, %18481, %23364) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%23365, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %23366 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %23367 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %23368 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %23369 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %23370 = "torch.aten.slice.Tensor"(%23365, %23366, %23367, %23368, %23369) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%23370, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %23371 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %23372 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %23373 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %23374 = "torch.aten.slice.Tensor"(%23361, %23371, %23372, %18481, %23373) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%23374, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %23375 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %23376 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %23377 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %23378 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %23379 = "torch.aten.slice.Tensor"(%23374, %23375, %23376, %23377, %23378) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%23379, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %23380 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %23381 = "torch.aten.unsqueeze"(%23370, %23380) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%23381, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %23382 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %23383 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %23384 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %23385 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %23386 = "torch.aten.slice.Tensor"(%23381, %23382, %23383, %23384, %23385) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%23386, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %23387 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %23388 = "torch.aten.unsqueeze"(%23386, %23387) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%23388, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %23389 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %23390 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %23391 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %23392 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %23393 = "torch.aten.slice.Tensor"(%23388, %23389, %23390, %23391, %23392) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%23393, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %23394 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %23395 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %23396 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %23397 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %23398 = "torch.prim.ListConstruct"(%23394, %23395, %23396, %23397) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %23399 = "torch.aten.repeat"(%23393, %23398) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%23399, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
    %23400 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %23401 = "torch.aten.unsqueeze"(%23379, %23400) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%23401, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %23402 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %23403 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %23404 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %23405 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %23406 = "torch.aten.slice.Tensor"(%23401, %23402, %23403, %23404, %23405) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%23406, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %23407 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %23408 = "torch.aten.unsqueeze"(%23406, %23407) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%23408, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %23409 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %23410 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %23411 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %23412 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %23413 = "torch.aten.slice.Tensor"(%23408, %23409, %23410, %23411, %23412) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%23413, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %23414 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %23415 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %23416 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %23417 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %23418 = "torch.prim.ListConstruct"(%23414, %23415, %23416, %23417) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %23419 = "torch.aten.repeat"(%23413, %23418) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%23419, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
    %23420 = "torch.aten.mul.Tensor"(%23277, %23399) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%23420, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %23421 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %23422 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %23423 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %23424 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %23425 = "torch.aten.slice.Tensor"(%23277, %23421, %23422, %23423, %23424) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%23425, %18474) <{shape_expressions = #map18}> : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.int) -> ()
    %23426 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %23427 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %23428 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %23429 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %23430 = "torch.aten.slice.Tensor"(%23277, %23426, %23427, %23428, %23429) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%23430, %18474) <{shape_expressions = #map18}> : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.int) -> ()
    %23431 = "torch.aten.neg"(%23430) : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>) -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%23431, %18474) <{shape_expressions = #map18}> : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.int) -> ()
    %23432 = "torch.prim.ListConstruct"(%23431, %23425) : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>) -> !torch.list<vtensor>
    %23433 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %23434 = "torch.aten.cat"(%23432, %23433) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%23434, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %23435 = "torch.aten.mul.Tensor"(%23434, %23419) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%23435, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %23436 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %23437 = "torch.aten.add.Tensor"(%23420, %23435, %23436) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%23437, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %23438 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
    %23439 = "torch.constant.none"() : () -> !torch.none
    %23440 = "torch.constant.none"() : () -> !torch.none
    %23441 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %23442 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %23443 = "torch.aten.arange"(%23438, %23439, %23440, %23441, %23442) : (!torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[131072],si64>
    %23444 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %23445 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %23446 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %23447 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %23448 = "torch.constant.none"() : () -> !torch.none
    %23449 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %23450 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %23451 = "torch.aten.arange.start_step"(%23444, %23445, %23446, %23447, %23448, %23449, %23450) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[64],si64>
    %23452 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %23453 = "torch.prims.convert_element_type"(%23451, %23452) : (!torch.vtensor<[64],si64>, !torch.int) -> !torch.vtensor<[64],f32>
    %23454 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %23455 = "torch.aten.div.Scalar"(%23453, %23454) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %23456 = "torch.constant.float"() <{value = 5.000000e+05 : f64}> : () -> !torch.float
    %23457 = "torch.aten.pow.Scalar"(%23456, %23455) : (!torch.float, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %23458 = "torch.aten.reciprocal"(%23457) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %23459 = "torch.constant.float"() <{value = 1.000000e+00 : f64}> : () -> !torch.float
    %23460 = "torch.aten.mul.Scalar"(%23458, %23459) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
    %23461 = "torch.aten.reciprocal"(%23460) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %23462 = "torch.constant.float"() <{value = 6.2831853071795862 : f64}> : () -> !torch.float
    %23463 = "torch.aten.mul.Scalar"(%23461, %23462) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
    %23464 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
    %23465 = "torch.aten.gt.Scalar"(%23463, %23464) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %23466 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %23467 = "torch.aten.div.Scalar"(%23460, %23466) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %23468 = "torch.aten.where.self"(%23465, %23467, %23460) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %23469 = "torch.aten.reciprocal"(%23463) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %23470 = "torch.constant.int"() <{value = 8192 : i64}> : () -> !torch.int
    %23471 = "torch.aten.mul.Scalar"(%23469, %23470) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %23472 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %23473 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %23474 = "torch.aten.sub.Scalar"(%23471, %23472, %23473) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
    %23475 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %23476 = "torch.aten.div.Scalar"(%23474, %23475) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %23477 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %23478 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %23479 = "torch.aten.rsub.Scalar"(%23476, %23477, %23478) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
    %23480 = "torch.aten.mul.Tensor"(%23479, %23468) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %23481 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %23482 = "torch.aten.div.Scalar"(%23480, %23481) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %23483 = "torch.aten.mul.Tensor"(%23476, %23468) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %23484 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %23485 = "torch.aten.add.Tensor"(%23482, %23483, %23484) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %23486 = "torch.constant.float"() <{value = 2.048000e+03 : f64}> : () -> !torch.float
    %23487 = "torch.aten.lt.Scalar"(%23463, %23486) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %23488 = "torch.aten.bitwise_not"(%23487) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %23489 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
    %23490 = "torch.aten.gt.Scalar"(%23463, %23489) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %23491 = "torch.aten.bitwise_not"(%23490) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %23492 = "torch.aten.mul.Tensor"(%23488, %23491) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %23493 = "torch.aten.where.self"(%23492, %23485, %23468) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %23494 = "torch.prim.ListConstruct"(%23493, %23493) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.list<vtensor>
    %23495 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %23496 = "torch.aten.cat"(%23494, %23495) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[128],f32>
    %23497 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %23498 = "torch.prims.convert_element_type"(%23443, %23497) : (!torch.vtensor<[131072],si64>, !torch.int) -> !torch.vtensor<[131072],f32>
    %23499 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %23500 = "torch.prims.convert_element_type"(%23496, %23499) : (!torch.vtensor<[128],f32>, !torch.int) -> !torch.vtensor<[128],f32>
    %23501 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
    %23502 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %23503 = "torch.prim.ListConstruct"(%23501, %23502) : (!torch.int, !torch.int) -> !torch.list<int>
    %23504 = "torch.aten.view"(%23498, %23503) : (!torch.vtensor<[131072],f32>, !torch.list<int>) -> !torch.vtensor<[131072,1],f32>
    %23505 = "torch.aten.mul.Tensor"(%23504, %23500) : (!torch.vtensor<[131072,1],f32>, !torch.vtensor<[128],f32>) -> !torch.vtensor<[131072,128],f32>
    %23506 = "torch.aten.cos"(%23505) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
    %23507 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %23508 = "torch.prims.convert_element_type"(%23506, %23507) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
    %23509 = "torch.aten.sin"(%23505) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
    %23510 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %23511 = "torch.prims.convert_element_type"(%23509, %23510) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
    %23512 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %23513 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %23514 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %23515 = "torch.aten.slice.Tensor"(%23508, %23512, %23513, %18481, %23514) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%23515, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %23516 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %23517 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %23518 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %23519 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %23520 = "torch.aten.slice.Tensor"(%23515, %23516, %23517, %23518, %23519) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%23520, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %23521 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %23522 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %23523 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %23524 = "torch.aten.slice.Tensor"(%23511, %23521, %23522, %18481, %23523) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%23524, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %23525 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %23526 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %23527 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %23528 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %23529 = "torch.aten.slice.Tensor"(%23524, %23525, %23526, %23527, %23528) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%23529, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %23530 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %23531 = "torch.aten.unsqueeze"(%23520, %23530) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%23531, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %23532 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %23533 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %23534 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %23535 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %23536 = "torch.aten.slice.Tensor"(%23531, %23532, %23533, %23534, %23535) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%23536, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %23537 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %23538 = "torch.aten.unsqueeze"(%23536, %23537) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%23538, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %23539 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %23540 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %23541 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %23542 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %23543 = "torch.aten.slice.Tensor"(%23538, %23539, %23540, %23541, %23542) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%23543, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %23544 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %23545 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %23546 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %23547 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %23548 = "torch.prim.ListConstruct"(%23544, %23545, %23546, %23547) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %23549 = "torch.aten.repeat"(%23543, %23548) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%23549, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
    %23550 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %23551 = "torch.aten.unsqueeze"(%23529, %23550) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%23551, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %23552 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %23553 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %23554 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %23555 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %23556 = "torch.aten.slice.Tensor"(%23551, %23552, %23553, %23554, %23555) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%23556, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %23557 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %23558 = "torch.aten.unsqueeze"(%23556, %23557) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%23558, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %23559 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %23560 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %23561 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %23562 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %23563 = "torch.aten.slice.Tensor"(%23558, %23559, %23560, %23561, %23562) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%23563, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %23564 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %23565 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %23566 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %23567 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %23568 = "torch.prim.ListConstruct"(%23564, %23565, %23566, %23567) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %23569 = "torch.aten.repeat"(%23563, %23568) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%23569, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
    %23570 = "torch.aten.mul.Tensor"(%23282, %23549) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%23570, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %23571 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %23572 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %23573 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %23574 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %23575 = "torch.aten.slice.Tensor"(%23282, %23571, %23572, %23573, %23574) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%23575, %18474) <{shape_expressions = #map19}> : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.int) -> ()
    %23576 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %23577 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %23578 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %23579 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %23580 = "torch.aten.slice.Tensor"(%23282, %23576, %23577, %23578, %23579) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%23580, %18474) <{shape_expressions = #map19}> : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.int) -> ()
    %23581 = "torch.aten.neg"(%23580) : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>) -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%23581, %18474) <{shape_expressions = #map19}> : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.int) -> ()
    %23582 = "torch.prim.ListConstruct"(%23581, %23575) : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>) -> !torch.list<vtensor>
    %23583 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %23584 = "torch.aten.cat"(%23582, %23583) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%23584, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %23585 = "torch.aten.mul.Tensor"(%23584, %23569) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%23585, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %23586 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %23587 = "torch.aten.add.Tensor"(%23570, %23585, %23586) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%23587, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %23588 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %23589 = "torch.aten.mul.Scalar"(%arg69, %23588) : (!torch.vtensor<[4,?],si64>, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%23589, %18474) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %23590 = "torch.constant.int"() <{value = 14 : i64}> : () -> !torch.int
    %23591 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %23592 = "torch.aten.add.Scalar"(%23589, %23590, %23591) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%23592, %18474) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %23593 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %23594 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %23595 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %23596 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %23597 = "torch.prim.ListConstruct"(%23593, %18477, %23594, %23595, %23596) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %23598 = "torch.aten.view"(%23587, %23597) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%23598, %18474) <{shape_expressions = #map22}> : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %23599 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %23600 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %23601 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %23602 = "torch.prim.ListConstruct"(%19011, %23599, %23600, %23601) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %23603 = "torch.aten.view"(%23598, %23602) : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%23603, %18474) <{shape_expressions = #map23}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %23604 = "torch.prim.ListConstruct"(%19011) : (!torch.int) -> !torch.list<int>
    %23605 = "torch.aten.view"(%23592, %23604) : (!torch.vtensor<[4,?],si64>, !torch.list<int>) -> !torch.vtensor<[?],si64>
    "torch.bind_symbolic_shape"(%23605, %18474) <{shape_expressions = #map24}> : (!torch.vtensor<[?],si64>, !torch.int) -> ()
    %23606 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %23607 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %23608 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %23609 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %23610 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %23611 = "torch.prim.ListConstruct"(%18479, %23606, %23607, %23608, %23609, %23610) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %23612 = "torch.aten.view"(%23014, %23611) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%23612, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %23613 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %23614 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %23615 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %23616 = "torch.prim.ListConstruct"(%18993, %23613, %23614, %23615) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %23617 = "torch.aten.view"(%23612, %23616) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%23617, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %23618 = "torch.prim.ListConstruct"(%23605) : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>>
    %23619 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %23620 = "torch.aten.index_put"(%23617, %23618, %23603, %23619) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.bool) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%23620, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %23621 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %23622 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %23623 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %23624 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %23625 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %23626 = "torch.prim.ListConstruct"(%18479, %23621, %23622, %23623, %23624, %23625) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %23627 = "torch.aten.view"(%23620, %23626) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%23627, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %23628 = "torch.constant.int"() <{value = 2097152 : i64}> : () -> !torch.int
    %23629 = "torch.prim.ListConstruct"(%18479, %23628) : (!torch.int, !torch.int) -> !torch.list<int>
    %23630 = "torch.aten.view"(%23627, %23629) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,2097152],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%23630, %18475) <{shape_expressions = #map2}> : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> ()
    %23631 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %23632 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %23633 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %23634 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %23635 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %23636 = "torch.prim.ListConstruct"(%18479, %23631, %23632, %23633, %23634, %23635) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %23637 = "torch.aten.view"(%23630, %23636) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%23637, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %23638 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %23639 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %23640 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %23641 = "torch.prim.ListConstruct"(%18993, %23638, %23639, %23640) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %23642 = "torch.aten.view"(%23637, %23641) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%23642, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %23643 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %23644 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %23645 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %23646 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %23647 = "torch.prim.ListConstruct"(%23643, %18477, %23644, %23645, %23646) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %23648 = "torch.aten.view"(%23287, %23647) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%23648, %18474) <{shape_expressions = #map22}> : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %23649 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %23650 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %23651 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %23652 = "torch.prim.ListConstruct"(%19011, %23649, %23650, %23651) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %23653 = "torch.aten.view"(%23648, %23652) : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%23653, %18474) <{shape_expressions = #map23}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %23654 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %23655 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %23656 = "torch.aten.add.Scalar"(%23592, %23654, %23655) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%23656, %18474) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %23657 = "torch.prim.ListConstruct"(%19011) : (!torch.int) -> !torch.list<int>
    %23658 = "torch.aten.view"(%23656, %23657) : (!torch.vtensor<[4,?],si64>, !torch.list<int>) -> !torch.vtensor<[?],si64>
    "torch.bind_symbolic_shape"(%23658, %18474) <{shape_expressions = #map24}> : (!torch.vtensor<[?],si64>, !torch.int) -> ()
    %23659 = "torch.prim.ListConstruct"(%23658) : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>>
    %23660 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %23661 = "torch.aten.index_put"(%23642, %23659, %23653, %23660) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.bool) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%23661, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %23662 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %23663 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %23664 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %23665 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %23666 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %23667 = "torch.prim.ListConstruct"(%18479, %23662, %23663, %23664, %23665, %23666) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %23668 = "torch.aten.view"(%23661, %23667) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%23668, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %23669 = "torch.constant.int"() <{value = 2097152 : i64}> : () -> !torch.int
    %23670 = "torch.prim.ListConstruct"(%18479, %23669) : (!torch.int, !torch.int) -> !torch.list<int>
    %23671 = "torch.aten.view"(%23668, %23670) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,2097152],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%23671, %18475) <{shape_expressions = #map2}> : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> ()
    %23672 = "torch.constant.int"() <{value = -2 : i64}> : () -> !torch.int
    %23673 = "torch.aten.unsqueeze"(%23587, %23672) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%23673, %18474) <{shape_expressions = #map25}> : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.int) -> ()
    %23674 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %23675 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %23676 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %23677 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %23678 = "torch.prim.ListConstruct"(%23674, %18481, %23675, %23676, %23677) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %23679 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %23680 = "torch.aten.expand"(%23673, %23678, %23679) : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%23680, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %23681 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %23682 = "torch.aten.clone"(%23680, %23681) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%23682, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %23683 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %23684 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %23685 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %23686 = "torch.prim.ListConstruct"(%23683, %18481, %23684, %23685) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %23687 = "torch.aten._unsafe_view"(%23682, %23686) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%23687, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %23688 = "torch.constant.int"() <{value = -2 : i64}> : () -> !torch.int
    %23689 = "torch.aten.unsqueeze"(%23287, %23688) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%23689, %18474) <{shape_expressions = #map25}> : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.int) -> ()
    %23690 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %23691 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %23692 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %23693 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %23694 = "torch.prim.ListConstruct"(%23690, %18481, %23691, %23692, %23693) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %23695 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %23696 = "torch.aten.expand"(%23689, %23694, %23695) : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%23696, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %23697 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %23698 = "torch.aten.clone"(%23696, %23697) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%23698, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %23699 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %23700 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %23701 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %23702 = "torch.prim.ListConstruct"(%23699, %18481, %23700, %23701) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %23703 = "torch.aten._unsafe_view"(%23698, %23702) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%23703, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %23704 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %23705 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %23706 = "torch.aten.transpose.int"(%23437, %23704, %23705) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%23706, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %23707 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %23708 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %23709 = "torch.aten.transpose.int"(%23687, %23707, %23708) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%23709, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %23710 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %23711 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %23712 = "torch.aten.transpose.int"(%23703, %23710, %23711) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%23712, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %23713 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %23714 = "torch.aten.squeeze.dim"(%18570, %23713) : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,1,?,?],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%23714, %18474) <{shape_expressions = #map8}> : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> ()
    %23715 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %23716 = "torch.aten.squeeze.dim"(%23714, %23715) : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,1,?,?],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%23716, %18474) <{shape_expressions = #map8}> : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> ()
    %23717 = "torch_c.to_builtin_tensor"(%23706) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %23718 = "torch_c.to_builtin_tensor"(%23709) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %23719 = "torch_c.to_builtin_tensor"(%23712) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %23720 = "torch_c.to_builtin_tensor"(%23716) : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>) -> tensor<4x1x?x?xf8E4M3FNUZ>
    %23721 = "tensor.cast"(%23720) : (tensor<4x1x?x?xf8E4M3FNUZ>) -> tensor<?x?x?x?xf8E4M3FNUZ>
    %23722 = "torch_c.to_builtin_tensor"(%17489) : (!torch.vtensor<[],f32>) -> tensor<f32>
    %23723 = "util.call"(%23717, %23718, %23719, %23722, %23721) <{callee = @sharktank_masked_flash_attention_4_32_128_128_f8E4M3FNUZ_f32_f32}> : (tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<f32>, tensor<?x?x?x?xf8E4M3FNUZ>) -> tensor<4x32x?x128xf32>
    %23724 = "torch_c.from_builtin_tensor"(%23723) : (tensor<4x32x?x128xf32>) -> !torch.vtensor<[4,32,?,128],f32>
    "torch.bind_symbolic_shape"(%23724, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f32>, !torch.int) -> ()
    %23725 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %23726 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %23727 = "torch.aten.transpose.int"(%23724, %23725, %23726) : (!torch.vtensor<[4,32,?,128],f32>, !torch.int, !torch.int) -> !torch.vtensor<[4,?,32,128],f32>
    "torch.bind_symbolic_shape"(%23727, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f32>, !torch.int) -> ()
    %23728 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %23729 = "torch.aten.clone"(%23727, %23728) : (!torch.vtensor<[4,?,32,128],f32>, !torch.int) -> !torch.vtensor<[4,?,32,128],f32>
    "torch.bind_symbolic_shape"(%23729, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f32>, !torch.int) -> ()
    %23730 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %23731 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %23732 = "torch.prim.ListConstruct"(%23730, %18481, %23731) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %23733 = "torch.aten._unsafe_view"(%23729, %23732) : (!torch.vtensor<[4,?,32,128],f32>, !torch.list<int>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%23733, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %23734 = "torch.aten.div.Tensor"(%23733, %17491) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%23734, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %23735 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %23736 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %23737 = "torch.aten.clamp"(%23734, %23735, %23736) : (!torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%23737, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %23738 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %23739 = "torch.prims.convert_element_type"(%23737, %23738) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%23739, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %23740 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %23741 = "torch.aten.unsqueeze"(%17493, %23740) : (!torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>
    %23742 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %23743 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %23744 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %23745 = "torch.prim.ListConstruct"(%23742, %23743, %23744) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %23746 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %23747 = "torch.aten.expand"(%23741, %23745, %23746) : (!torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ>
    %23748 = "torch_c.to_builtin_tensor"(%23739) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %23749 = "torch_c.to_builtin_tensor"(%23747) : (!torch.vtensor<[4,4096,4096],f8E4M3FNUZ>) -> tensor<4x4096x4096xf8E4M3FNUZ>
    %23750 = "util.call"(%23748, %23749) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x?x4096xf32>
    %23751 = "torch_c.from_builtin_tensor"(%23750) : (tensor<4x?x4096xf32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%23751, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %23752 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %23753 = "torch.prims.convert_element_type"(%23751, %23752) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%23753, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %23754 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %23755 = "torch.aten.add.Tensor"(%23181, %23753, %23754) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%23755, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %23756 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %23757 = "torch.prims.convert_element_type"(%23755, %23756) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%23757, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %23758 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %23759 = "torch.aten.pow.Tensor_Scalar"(%23757, %23758) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%23759, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %23760 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %23761 = "torch.prim.ListConstruct"(%23760) : (!torch.int) -> !torch.list<int>
    %23762 = "torch.constant.bool"() <{value = true}> : () -> !torch.bool
    %23763 = "torch.constant.none"() : () -> !torch.none
    %23764 = "torch.aten.mean.dim"(%23759, %23761, %23762, %23763) : (!torch.vtensor<[4,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%23764, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %23765 = "torch.constant.float"() <{value = 1.000000e-05 : f64}> : () -> !torch.float
    %23766 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %23767 = "torch.aten.add.Scalar"(%23764, %23765, %23766) : (!torch.vtensor<[4,?,1],f32>, !torch.float, !torch.int) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%23767, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %23768 = "torch.aten.rsqrt"(%23767) : (!torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%23768, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %23769 = "torch.aten.mul.Tensor"(%23757, %23768) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%23769, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %23770 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %23771 = "torch.prims.convert_element_type"(%23769, %23770) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%23771, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %23772 = "torch.aten.mul.Tensor"(%17495, %23771) : (!torch.vtensor<[4096],bf16>, !torch.vtensor<[4,?,4096],bf16>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%23772, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %23773 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %23774 = "torch.prims.convert_element_type"(%23772, %23773) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%23774, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %23775 = "torch.aten.div.Tensor"(%23774, %17497) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%23775, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %23776 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %23777 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %23778 = "torch.aten.clamp"(%23775, %23776, %23777) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%23778, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %23779 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %23780 = "torch.prims.convert_element_type"(%23778, %23779) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%23780, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %23781 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %23782 = "torch.aten.unsqueeze"(%17499, %23781) : (!torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>
    %23783 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %23784 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %23785 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %23786 = "torch.prim.ListConstruct"(%23783, %23784, %23785) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %23787 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %23788 = "torch.aten.expand"(%23782, %23786, %23787) : (!torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ>
    %23789 = "torch_c.to_builtin_tensor"(%23780) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %23790 = "torch_c.to_builtin_tensor"(%23788) : (!torch.vtensor<[4,14336,4096],f8E4M3FNUZ>) -> tensor<4x14336x4096xf8E4M3FNUZ>
    %23791 = "util.call"(%23789, %23790) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x?x14336xf32>
    %23792 = "torch_c.from_builtin_tensor"(%23791) : (tensor<4x?x14336xf32>) -> !torch.vtensor<[4,?,14336],f32>
    "torch.bind_symbolic_shape"(%23792, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> ()
    %23793 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %23794 = "torch.prims.convert_element_type"(%23792, %23793) : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%23794, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %23795 = "torch.aten.silu"(%23794) : (!torch.vtensor<[4,?,14336],bf16>) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%23795, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %23796 = "torch.aten.div.Tensor"(%23774, %17501) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%23796, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %23797 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %23798 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %23799 = "torch.aten.clamp"(%23796, %23797, %23798) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%23799, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %23800 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %23801 = "torch.prims.convert_element_type"(%23799, %23800) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%23801, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %23802 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %23803 = "torch.aten.unsqueeze"(%17503, %23802) : (!torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>
    %23804 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %23805 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %23806 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %23807 = "torch.prim.ListConstruct"(%23804, %23805, %23806) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %23808 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %23809 = "torch.aten.expand"(%23803, %23807, %23808) : (!torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ>
    %23810 = "torch_c.to_builtin_tensor"(%23801) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %23811 = "torch_c.to_builtin_tensor"(%23809) : (!torch.vtensor<[4,14336,4096],f8E4M3FNUZ>) -> tensor<4x14336x4096xf8E4M3FNUZ>
    %23812 = "util.call"(%23810, %23811) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x?x14336xf32>
    %23813 = "torch_c.from_builtin_tensor"(%23812) : (tensor<4x?x14336xf32>) -> !torch.vtensor<[4,?,14336],f32>
    "torch.bind_symbolic_shape"(%23813, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> ()
    %23814 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %23815 = "torch.prims.convert_element_type"(%23813, %23814) : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%23815, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %23816 = "torch.aten.mul.Tensor"(%23795, %23815) : (!torch.vtensor<[4,?,14336],bf16>, !torch.vtensor<[4,?,14336],bf16>) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%23816, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %23817 = "torch.aten.div.Tensor"(%23816, %17505) : (!torch.vtensor<[4,?,14336],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%23817, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %23818 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %23819 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %23820 = "torch.aten.clamp"(%23817, %23818, %23819) : (!torch.vtensor<[4,?,14336],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%23820, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %23821 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %23822 = "torch.prims.convert_element_type"(%23820, %23821) : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> !torch.vtensor<[4,?,14336],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%23822, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],f8E4M3FNUZ>, !torch.int) -> ()
    %23823 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %23824 = "torch.aten.unsqueeze"(%17507, %23823) : (!torch.vtensor<[4096,14336],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,14336],f8E4M3FNUZ>
    %23825 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %23826 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %23827 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %23828 = "torch.prim.ListConstruct"(%23825, %23826, %23827) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %23829 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %23830 = "torch.aten.expand"(%23824, %23828, %23829) : (!torch.vtensor<[1,4096,14336],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,14336],f8E4M3FNUZ>
    %23831 = "torch_c.to_builtin_tensor"(%23822) : (!torch.vtensor<[4,?,14336],f8E4M3FNUZ>) -> tensor<4x?x14336xf8E4M3FNUZ>
    %23832 = "torch_c.to_builtin_tensor"(%23830) : (!torch.vtensor<[4,4096,14336],f8E4M3FNUZ>) -> tensor<4x4096x14336xf8E4M3FNUZ>
    %23833 = "util.call"(%23831, %23832) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx14336xf8E4M3FNUZ_R4x4096x14336xf8E4M3FNUZ}> : (tensor<4x?x14336xf8E4M3FNUZ>, tensor<4x4096x14336xf8E4M3FNUZ>) -> tensor<4x?x4096xf32>
    %23834 = "torch_c.from_builtin_tensor"(%23833) : (tensor<4x?x4096xf32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%23834, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %23835 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %23836 = "torch.prims.convert_element_type"(%23834, %23835) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%23836, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %23837 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %23838 = "torch.aten.add.Tensor"(%23755, %23836, %23837) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%23838, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %23839 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %23840 = "torch.prims.convert_element_type"(%23838, %23839) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%23840, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %23841 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %23842 = "torch.aten.pow.Tensor_Scalar"(%23840, %23841) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%23842, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %23843 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %23844 = "torch.prim.ListConstruct"(%23843) : (!torch.int) -> !torch.list<int>
    %23845 = "torch.constant.bool"() <{value = true}> : () -> !torch.bool
    %23846 = "torch.constant.none"() : () -> !torch.none
    %23847 = "torch.aten.mean.dim"(%23842, %23844, %23845, %23846) : (!torch.vtensor<[4,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%23847, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %23848 = "torch.constant.float"() <{value = 1.000000e-05 : f64}> : () -> !torch.float
    %23849 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %23850 = "torch.aten.add.Scalar"(%23847, %23848, %23849) : (!torch.vtensor<[4,?,1],f32>, !torch.float, !torch.int) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%23850, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %23851 = "torch.aten.rsqrt"(%23850) : (!torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%23851, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %23852 = "torch.aten.mul.Tensor"(%23840, %23851) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%23852, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %23853 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %23854 = "torch.prims.convert_element_type"(%23852, %23853) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%23854, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %23855 = "torch.aten.mul.Tensor"(%17509, %23854) : (!torch.vtensor<[4096],bf16>, !torch.vtensor<[4,?,4096],bf16>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%23855, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %23856 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %23857 = "torch.prims.convert_element_type"(%23855, %23856) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%23857, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %23858 = "torch.aten.div.Tensor"(%23857, %17511) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%23858, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %23859 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %23860 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %23861 = "torch.aten.clamp"(%23858, %23859, %23860) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%23861, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %23862 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %23863 = "torch.prims.convert_element_type"(%23861, %23862) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%23863, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %23864 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %23865 = "torch.aten.unsqueeze"(%17513, %23864) : (!torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>
    %23866 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %23867 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %23868 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %23869 = "torch.prim.ListConstruct"(%23866, %23867, %23868) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %23870 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %23871 = "torch.aten.expand"(%23865, %23869, %23870) : (!torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ>
    %23872 = "torch_c.to_builtin_tensor"(%23863) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %23873 = "torch_c.to_builtin_tensor"(%23871) : (!torch.vtensor<[4,4096,4096],f8E4M3FNUZ>) -> tensor<4x4096x4096xf8E4M3FNUZ>
    %23874 = "util.call"(%23872, %23873) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x?x4096xf32>
    %23875 = "torch_c.from_builtin_tensor"(%23874) : (tensor<4x?x4096xf32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%23875, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %23876 = "torch.aten.div.Tensor"(%23875, %17515) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%23876, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %23877 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %23878 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %23879 = "torch.aten.clamp"(%23876, %23877, %23878) : (!torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%23879, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %23880 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %23881 = "torch.prims.convert_element_type"(%23879, %23880) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%23881, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %23882 = "torch.aten.div.Tensor"(%23857, %17517) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%23882, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %23883 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %23884 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %23885 = "torch.aten.clamp"(%23882, %23883, %23884) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%23885, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %23886 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %23887 = "torch.prims.convert_element_type"(%23885, %23886) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%23887, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %23888 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %23889 = "torch.aten.unsqueeze"(%17519, %23888) : (!torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>
    %23890 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %23891 = "torch.constant.int"() <{value = 1024 : i64}> : () -> !torch.int
    %23892 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %23893 = "torch.prim.ListConstruct"(%23890, %23891, %23892) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %23894 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %23895 = "torch.aten.expand"(%23889, %23893, %23894) : (!torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ>
    %23896 = "torch_c.to_builtin_tensor"(%23887) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %23897 = "torch_c.to_builtin_tensor"(%23895) : (!torch.vtensor<[4,1024,4096],f8E4M3FNUZ>) -> tensor<4x1024x4096xf8E4M3FNUZ>
    %23898 = "util.call"(%23896, %23897) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x?x1024xf32>
    %23899 = "torch_c.from_builtin_tensor"(%23898) : (tensor<4x?x1024xf32>) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%23899, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %23900 = "torch.aten.div.Tensor"(%23899, %17521) : (!torch.vtensor<[4,?,1024],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%23900, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %23901 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %23902 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %23903 = "torch.aten.clamp"(%23900, %23901, %23902) : (!torch.vtensor<[4,?,1024],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%23903, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %23904 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %23905 = "torch.prims.convert_element_type"(%23903, %23904) : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> !torch.vtensor<[4,?,1024],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%23905, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.int) -> ()
    %23906 = "torch.aten.div.Tensor"(%23857, %17523) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%23906, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %23907 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %23908 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %23909 = "torch.aten.clamp"(%23906, %23907, %23908) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%23909, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %23910 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %23911 = "torch.prims.convert_element_type"(%23909, %23910) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%23911, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %23912 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %23913 = "torch.aten.unsqueeze"(%17525, %23912) : (!torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>
    %23914 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %23915 = "torch.constant.int"() <{value = 1024 : i64}> : () -> !torch.int
    %23916 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %23917 = "torch.prim.ListConstruct"(%23914, %23915, %23916) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %23918 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %23919 = "torch.aten.expand"(%23913, %23917, %23918) : (!torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ>
    %23920 = "torch_c.to_builtin_tensor"(%23911) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %23921 = "torch_c.to_builtin_tensor"(%23919) : (!torch.vtensor<[4,1024,4096],f8E4M3FNUZ>) -> tensor<4x1024x4096xf8E4M3FNUZ>
    %23922 = "util.call"(%23920, %23921) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x?x1024xf32>
    %23923 = "torch_c.from_builtin_tensor"(%23922) : (tensor<4x?x1024xf32>) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%23923, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %23924 = "torch.aten.div.Tensor"(%23923, %17527) : (!torch.vtensor<[4,?,1024],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%23924, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %23925 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %23926 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %23927 = "torch.aten.clamp"(%23924, %23925, %23926) : (!torch.vtensor<[4,?,1024],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%23927, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %23928 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %23929 = "torch.prims.convert_element_type"(%23927, %23928) : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> !torch.vtensor<[4,?,1024],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%23929, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.int) -> ()
    %23930 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %23931 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %23932 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %23933 = "torch.prim.ListConstruct"(%23930, %18481, %23931, %23932) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %23934 = "torch.aten.view"(%23881, %23933) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%23934, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %23935 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %23936 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %23937 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %23938 = "torch.prim.ListConstruct"(%23935, %18481, %23936, %23937) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %23939 = "torch.aten.view"(%23905, %23938) : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%23939, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %23940 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %23941 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %23942 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %23943 = "torch.prim.ListConstruct"(%23940, %18481, %23941, %23942) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %23944 = "torch.aten.view"(%23929, %23943) : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%23944, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %23945 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
    %23946 = "torch.constant.none"() : () -> !torch.none
    %23947 = "torch.constant.none"() : () -> !torch.none
    %23948 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %23949 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %23950 = "torch.aten.arange"(%23945, %23946, %23947, %23948, %23949) : (!torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[131072],si64>
    %23951 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %23952 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %23953 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %23954 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %23955 = "torch.constant.none"() : () -> !torch.none
    %23956 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %23957 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %23958 = "torch.aten.arange.start_step"(%23951, %23952, %23953, %23954, %23955, %23956, %23957) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[64],si64>
    %23959 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %23960 = "torch.prims.convert_element_type"(%23958, %23959) : (!torch.vtensor<[64],si64>, !torch.int) -> !torch.vtensor<[64],f32>
    %23961 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %23962 = "torch.aten.div.Scalar"(%23960, %23961) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %23963 = "torch.constant.float"() <{value = 5.000000e+05 : f64}> : () -> !torch.float
    %23964 = "torch.aten.pow.Scalar"(%23963, %23962) : (!torch.float, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %23965 = "torch.aten.reciprocal"(%23964) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %23966 = "torch.constant.float"() <{value = 1.000000e+00 : f64}> : () -> !torch.float
    %23967 = "torch.aten.mul.Scalar"(%23965, %23966) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
    %23968 = "torch.aten.reciprocal"(%23967) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %23969 = "torch.constant.float"() <{value = 6.2831853071795862 : f64}> : () -> !torch.float
    %23970 = "torch.aten.mul.Scalar"(%23968, %23969) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
    %23971 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
    %23972 = "torch.aten.gt.Scalar"(%23970, %23971) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %23973 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %23974 = "torch.aten.div.Scalar"(%23967, %23973) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %23975 = "torch.aten.where.self"(%23972, %23974, %23967) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %23976 = "torch.aten.reciprocal"(%23970) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %23977 = "torch.constant.int"() <{value = 8192 : i64}> : () -> !torch.int
    %23978 = "torch.aten.mul.Scalar"(%23976, %23977) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %23979 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %23980 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %23981 = "torch.aten.sub.Scalar"(%23978, %23979, %23980) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
    %23982 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %23983 = "torch.aten.div.Scalar"(%23981, %23982) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %23984 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %23985 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %23986 = "torch.aten.rsub.Scalar"(%23983, %23984, %23985) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
    %23987 = "torch.aten.mul.Tensor"(%23986, %23975) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %23988 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %23989 = "torch.aten.div.Scalar"(%23987, %23988) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %23990 = "torch.aten.mul.Tensor"(%23983, %23975) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %23991 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %23992 = "torch.aten.add.Tensor"(%23989, %23990, %23991) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %23993 = "torch.constant.float"() <{value = 2.048000e+03 : f64}> : () -> !torch.float
    %23994 = "torch.aten.lt.Scalar"(%23970, %23993) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %23995 = "torch.aten.bitwise_not"(%23994) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %23996 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
    %23997 = "torch.aten.gt.Scalar"(%23970, %23996) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %23998 = "torch.aten.bitwise_not"(%23997) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %23999 = "torch.aten.mul.Tensor"(%23995, %23998) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %24000 = "torch.aten.where.self"(%23999, %23992, %23975) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %24001 = "torch.prim.ListConstruct"(%24000, %24000) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.list<vtensor>
    %24002 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %24003 = "torch.aten.cat"(%24001, %24002) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[128],f32>
    %24004 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %24005 = "torch.prims.convert_element_type"(%23950, %24004) : (!torch.vtensor<[131072],si64>, !torch.int) -> !torch.vtensor<[131072],f32>
    %24006 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %24007 = "torch.prims.convert_element_type"(%24003, %24006) : (!torch.vtensor<[128],f32>, !torch.int) -> !torch.vtensor<[128],f32>
    %24008 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
    %24009 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %24010 = "torch.prim.ListConstruct"(%24008, %24009) : (!torch.int, !torch.int) -> !torch.list<int>
    %24011 = "torch.aten.view"(%24005, %24010) : (!torch.vtensor<[131072],f32>, !torch.list<int>) -> !torch.vtensor<[131072,1],f32>
    %24012 = "torch.aten.mul.Tensor"(%24011, %24007) : (!torch.vtensor<[131072,1],f32>, !torch.vtensor<[128],f32>) -> !torch.vtensor<[131072,128],f32>
    %24013 = "torch.aten.cos"(%24012) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
    %24014 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %24015 = "torch.prims.convert_element_type"(%24013, %24014) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
    %24016 = "torch.aten.sin"(%24012) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
    %24017 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %24018 = "torch.prims.convert_element_type"(%24016, %24017) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
    %24019 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %24020 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %24021 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %24022 = "torch.aten.slice.Tensor"(%24015, %24019, %24020, %18481, %24021) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%24022, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %24023 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %24024 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %24025 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %24026 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %24027 = "torch.aten.slice.Tensor"(%24022, %24023, %24024, %24025, %24026) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%24027, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %24028 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %24029 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %24030 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %24031 = "torch.aten.slice.Tensor"(%24018, %24028, %24029, %18481, %24030) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%24031, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %24032 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %24033 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %24034 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %24035 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %24036 = "torch.aten.slice.Tensor"(%24031, %24032, %24033, %24034, %24035) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%24036, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %24037 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %24038 = "torch.aten.unsqueeze"(%24027, %24037) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%24038, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %24039 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %24040 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %24041 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %24042 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %24043 = "torch.aten.slice.Tensor"(%24038, %24039, %24040, %24041, %24042) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%24043, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %24044 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %24045 = "torch.aten.unsqueeze"(%24043, %24044) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%24045, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %24046 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %24047 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %24048 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %24049 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %24050 = "torch.aten.slice.Tensor"(%24045, %24046, %24047, %24048, %24049) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%24050, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %24051 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %24052 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %24053 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %24054 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %24055 = "torch.prim.ListConstruct"(%24051, %24052, %24053, %24054) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %24056 = "torch.aten.repeat"(%24050, %24055) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%24056, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
    %24057 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %24058 = "torch.aten.unsqueeze"(%24036, %24057) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%24058, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %24059 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %24060 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %24061 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %24062 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %24063 = "torch.aten.slice.Tensor"(%24058, %24059, %24060, %24061, %24062) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%24063, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %24064 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %24065 = "torch.aten.unsqueeze"(%24063, %24064) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%24065, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %24066 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %24067 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %24068 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %24069 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %24070 = "torch.aten.slice.Tensor"(%24065, %24066, %24067, %24068, %24069) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%24070, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %24071 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %24072 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %24073 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %24074 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %24075 = "torch.prim.ListConstruct"(%24071, %24072, %24073, %24074) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %24076 = "torch.aten.repeat"(%24070, %24075) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%24076, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
    %24077 = "torch.aten.mul.Tensor"(%23934, %24056) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%24077, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %24078 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %24079 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %24080 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %24081 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %24082 = "torch.aten.slice.Tensor"(%23934, %24078, %24079, %24080, %24081) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%24082, %18474) <{shape_expressions = #map18}> : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.int) -> ()
    %24083 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %24084 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %24085 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %24086 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %24087 = "torch.aten.slice.Tensor"(%23934, %24083, %24084, %24085, %24086) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%24087, %18474) <{shape_expressions = #map18}> : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.int) -> ()
    %24088 = "torch.aten.neg"(%24087) : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>) -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%24088, %18474) <{shape_expressions = #map18}> : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.int) -> ()
    %24089 = "torch.prim.ListConstruct"(%24088, %24082) : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>) -> !torch.list<vtensor>
    %24090 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %24091 = "torch.aten.cat"(%24089, %24090) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%24091, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %24092 = "torch.aten.mul.Tensor"(%24091, %24076) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%24092, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %24093 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %24094 = "torch.aten.add.Tensor"(%24077, %24092, %24093) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%24094, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %24095 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
    %24096 = "torch.constant.none"() : () -> !torch.none
    %24097 = "torch.constant.none"() : () -> !torch.none
    %24098 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %24099 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %24100 = "torch.aten.arange"(%24095, %24096, %24097, %24098, %24099) : (!torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[131072],si64>
    %24101 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %24102 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %24103 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %24104 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %24105 = "torch.constant.none"() : () -> !torch.none
    %24106 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %24107 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %24108 = "torch.aten.arange.start_step"(%24101, %24102, %24103, %24104, %24105, %24106, %24107) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[64],si64>
    %24109 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %24110 = "torch.prims.convert_element_type"(%24108, %24109) : (!torch.vtensor<[64],si64>, !torch.int) -> !torch.vtensor<[64],f32>
    %24111 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %24112 = "torch.aten.div.Scalar"(%24110, %24111) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %24113 = "torch.constant.float"() <{value = 5.000000e+05 : f64}> : () -> !torch.float
    %24114 = "torch.aten.pow.Scalar"(%24113, %24112) : (!torch.float, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %24115 = "torch.aten.reciprocal"(%24114) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %24116 = "torch.constant.float"() <{value = 1.000000e+00 : f64}> : () -> !torch.float
    %24117 = "torch.aten.mul.Scalar"(%24115, %24116) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
    %24118 = "torch.aten.reciprocal"(%24117) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %24119 = "torch.constant.float"() <{value = 6.2831853071795862 : f64}> : () -> !torch.float
    %24120 = "torch.aten.mul.Scalar"(%24118, %24119) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
    %24121 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
    %24122 = "torch.aten.gt.Scalar"(%24120, %24121) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %24123 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %24124 = "torch.aten.div.Scalar"(%24117, %24123) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %24125 = "torch.aten.where.self"(%24122, %24124, %24117) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %24126 = "torch.aten.reciprocal"(%24120) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %24127 = "torch.constant.int"() <{value = 8192 : i64}> : () -> !torch.int
    %24128 = "torch.aten.mul.Scalar"(%24126, %24127) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %24129 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %24130 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %24131 = "torch.aten.sub.Scalar"(%24128, %24129, %24130) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
    %24132 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %24133 = "torch.aten.div.Scalar"(%24131, %24132) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %24134 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %24135 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %24136 = "torch.aten.rsub.Scalar"(%24133, %24134, %24135) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
    %24137 = "torch.aten.mul.Tensor"(%24136, %24125) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %24138 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %24139 = "torch.aten.div.Scalar"(%24137, %24138) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %24140 = "torch.aten.mul.Tensor"(%24133, %24125) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %24141 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %24142 = "torch.aten.add.Tensor"(%24139, %24140, %24141) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %24143 = "torch.constant.float"() <{value = 2.048000e+03 : f64}> : () -> !torch.float
    %24144 = "torch.aten.lt.Scalar"(%24120, %24143) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %24145 = "torch.aten.bitwise_not"(%24144) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %24146 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
    %24147 = "torch.aten.gt.Scalar"(%24120, %24146) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %24148 = "torch.aten.bitwise_not"(%24147) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %24149 = "torch.aten.mul.Tensor"(%24145, %24148) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %24150 = "torch.aten.where.self"(%24149, %24142, %24125) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %24151 = "torch.prim.ListConstruct"(%24150, %24150) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.list<vtensor>
    %24152 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %24153 = "torch.aten.cat"(%24151, %24152) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[128],f32>
    %24154 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %24155 = "torch.prims.convert_element_type"(%24100, %24154) : (!torch.vtensor<[131072],si64>, !torch.int) -> !torch.vtensor<[131072],f32>
    %24156 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %24157 = "torch.prims.convert_element_type"(%24153, %24156) : (!torch.vtensor<[128],f32>, !torch.int) -> !torch.vtensor<[128],f32>
    %24158 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
    %24159 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %24160 = "torch.prim.ListConstruct"(%24158, %24159) : (!torch.int, !torch.int) -> !torch.list<int>
    %24161 = "torch.aten.view"(%24155, %24160) : (!torch.vtensor<[131072],f32>, !torch.list<int>) -> !torch.vtensor<[131072,1],f32>
    %24162 = "torch.aten.mul.Tensor"(%24161, %24157) : (!torch.vtensor<[131072,1],f32>, !torch.vtensor<[128],f32>) -> !torch.vtensor<[131072,128],f32>
    %24163 = "torch.aten.cos"(%24162) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
    %24164 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %24165 = "torch.prims.convert_element_type"(%24163, %24164) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
    %24166 = "torch.aten.sin"(%24162) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
    %24167 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %24168 = "torch.prims.convert_element_type"(%24166, %24167) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
    %24169 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %24170 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %24171 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %24172 = "torch.aten.slice.Tensor"(%24165, %24169, %24170, %18481, %24171) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%24172, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %24173 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %24174 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %24175 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %24176 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %24177 = "torch.aten.slice.Tensor"(%24172, %24173, %24174, %24175, %24176) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%24177, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %24178 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %24179 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %24180 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %24181 = "torch.aten.slice.Tensor"(%24168, %24178, %24179, %18481, %24180) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%24181, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %24182 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %24183 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %24184 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %24185 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %24186 = "torch.aten.slice.Tensor"(%24181, %24182, %24183, %24184, %24185) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%24186, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %24187 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %24188 = "torch.aten.unsqueeze"(%24177, %24187) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%24188, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %24189 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %24190 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %24191 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %24192 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %24193 = "torch.aten.slice.Tensor"(%24188, %24189, %24190, %24191, %24192) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%24193, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %24194 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %24195 = "torch.aten.unsqueeze"(%24193, %24194) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%24195, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %24196 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %24197 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %24198 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %24199 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %24200 = "torch.aten.slice.Tensor"(%24195, %24196, %24197, %24198, %24199) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%24200, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %24201 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %24202 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %24203 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %24204 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %24205 = "torch.prim.ListConstruct"(%24201, %24202, %24203, %24204) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %24206 = "torch.aten.repeat"(%24200, %24205) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%24206, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
    %24207 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %24208 = "torch.aten.unsqueeze"(%24186, %24207) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%24208, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %24209 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %24210 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %24211 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %24212 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %24213 = "torch.aten.slice.Tensor"(%24208, %24209, %24210, %24211, %24212) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%24213, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %24214 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %24215 = "torch.aten.unsqueeze"(%24213, %24214) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%24215, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %24216 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %24217 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %24218 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %24219 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %24220 = "torch.aten.slice.Tensor"(%24215, %24216, %24217, %24218, %24219) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%24220, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %24221 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %24222 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %24223 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %24224 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %24225 = "torch.prim.ListConstruct"(%24221, %24222, %24223, %24224) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %24226 = "torch.aten.repeat"(%24220, %24225) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%24226, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
    %24227 = "torch.aten.mul.Tensor"(%23939, %24206) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%24227, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %24228 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %24229 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %24230 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %24231 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %24232 = "torch.aten.slice.Tensor"(%23939, %24228, %24229, %24230, %24231) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%24232, %18474) <{shape_expressions = #map19}> : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.int) -> ()
    %24233 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %24234 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %24235 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %24236 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %24237 = "torch.aten.slice.Tensor"(%23939, %24233, %24234, %24235, %24236) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%24237, %18474) <{shape_expressions = #map19}> : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.int) -> ()
    %24238 = "torch.aten.neg"(%24237) : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>) -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%24238, %18474) <{shape_expressions = #map19}> : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.int) -> ()
    %24239 = "torch.prim.ListConstruct"(%24238, %24232) : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>) -> !torch.list<vtensor>
    %24240 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %24241 = "torch.aten.cat"(%24239, %24240) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%24241, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %24242 = "torch.aten.mul.Tensor"(%24241, %24226) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%24242, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %24243 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %24244 = "torch.aten.add.Tensor"(%24227, %24242, %24243) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%24244, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %24245 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %24246 = "torch.aten.mul.Scalar"(%arg69, %24245) : (!torch.vtensor<[4,?],si64>, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%24246, %18474) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %24247 = "torch.constant.int"() <{value = 16 : i64}> : () -> !torch.int
    %24248 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %24249 = "torch.aten.add.Scalar"(%24246, %24247, %24248) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%24249, %18474) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %24250 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %24251 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %24252 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %24253 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %24254 = "torch.prim.ListConstruct"(%24250, %18477, %24251, %24252, %24253) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %24255 = "torch.aten.view"(%24244, %24254) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%24255, %18474) <{shape_expressions = #map22}> : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %24256 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %24257 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %24258 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %24259 = "torch.prim.ListConstruct"(%19011, %24256, %24257, %24258) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %24260 = "torch.aten.view"(%24255, %24259) : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%24260, %18474) <{shape_expressions = #map23}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %24261 = "torch.prim.ListConstruct"(%19011) : (!torch.int) -> !torch.list<int>
    %24262 = "torch.aten.view"(%24249, %24261) : (!torch.vtensor<[4,?],si64>, !torch.list<int>) -> !torch.vtensor<[?],si64>
    "torch.bind_symbolic_shape"(%24262, %18474) <{shape_expressions = #map24}> : (!torch.vtensor<[?],si64>, !torch.int) -> ()
    %24263 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %24264 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %24265 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %24266 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %24267 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %24268 = "torch.prim.ListConstruct"(%18479, %24263, %24264, %24265, %24266, %24267) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %24269 = "torch.aten.view"(%23671, %24268) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%24269, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %24270 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %24271 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %24272 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %24273 = "torch.prim.ListConstruct"(%18993, %24270, %24271, %24272) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %24274 = "torch.aten.view"(%24269, %24273) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%24274, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %24275 = "torch.prim.ListConstruct"(%24262) : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>>
    %24276 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %24277 = "torch.aten.index_put"(%24274, %24275, %24260, %24276) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.bool) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%24277, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %24278 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %24279 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %24280 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %24281 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %24282 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %24283 = "torch.prim.ListConstruct"(%18479, %24278, %24279, %24280, %24281, %24282) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %24284 = "torch.aten.view"(%24277, %24283) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%24284, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %24285 = "torch.constant.int"() <{value = 2097152 : i64}> : () -> !torch.int
    %24286 = "torch.prim.ListConstruct"(%18479, %24285) : (!torch.int, !torch.int) -> !torch.list<int>
    %24287 = "torch.aten.view"(%24284, %24286) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,2097152],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%24287, %18475) <{shape_expressions = #map2}> : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> ()
    %24288 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %24289 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %24290 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %24291 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %24292 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %24293 = "torch.prim.ListConstruct"(%18479, %24288, %24289, %24290, %24291, %24292) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %24294 = "torch.aten.view"(%24287, %24293) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%24294, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %24295 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %24296 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %24297 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %24298 = "torch.prim.ListConstruct"(%18993, %24295, %24296, %24297) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %24299 = "torch.aten.view"(%24294, %24298) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%24299, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %24300 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %24301 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %24302 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %24303 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %24304 = "torch.prim.ListConstruct"(%24300, %18477, %24301, %24302, %24303) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %24305 = "torch.aten.view"(%23944, %24304) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%24305, %18474) <{shape_expressions = #map22}> : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %24306 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %24307 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %24308 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %24309 = "torch.prim.ListConstruct"(%19011, %24306, %24307, %24308) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %24310 = "torch.aten.view"(%24305, %24309) : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%24310, %18474) <{shape_expressions = #map23}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %24311 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %24312 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %24313 = "torch.aten.add.Scalar"(%24249, %24311, %24312) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%24313, %18474) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %24314 = "torch.prim.ListConstruct"(%19011) : (!torch.int) -> !torch.list<int>
    %24315 = "torch.aten.view"(%24313, %24314) : (!torch.vtensor<[4,?],si64>, !torch.list<int>) -> !torch.vtensor<[?],si64>
    "torch.bind_symbolic_shape"(%24315, %18474) <{shape_expressions = #map24}> : (!torch.vtensor<[?],si64>, !torch.int) -> ()
    %24316 = "torch.prim.ListConstruct"(%24315) : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>>
    %24317 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %24318 = "torch.aten.index_put"(%24299, %24316, %24310, %24317) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.bool) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%24318, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %24319 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %24320 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %24321 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %24322 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %24323 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %24324 = "torch.prim.ListConstruct"(%18479, %24319, %24320, %24321, %24322, %24323) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %24325 = "torch.aten.view"(%24318, %24324) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%24325, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %24326 = "torch.constant.int"() <{value = 2097152 : i64}> : () -> !torch.int
    %24327 = "torch.prim.ListConstruct"(%18479, %24326) : (!torch.int, !torch.int) -> !torch.list<int>
    %24328 = "torch.aten.view"(%24325, %24327) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,2097152],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%24328, %18475) <{shape_expressions = #map2}> : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> ()
    %24329 = "torch.constant.int"() <{value = -2 : i64}> : () -> !torch.int
    %24330 = "torch.aten.unsqueeze"(%24244, %24329) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%24330, %18474) <{shape_expressions = #map25}> : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.int) -> ()
    %24331 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %24332 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %24333 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %24334 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %24335 = "torch.prim.ListConstruct"(%24331, %18481, %24332, %24333, %24334) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %24336 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %24337 = "torch.aten.expand"(%24330, %24335, %24336) : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%24337, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %24338 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %24339 = "torch.aten.clone"(%24337, %24338) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%24339, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %24340 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %24341 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %24342 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %24343 = "torch.prim.ListConstruct"(%24340, %18481, %24341, %24342) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %24344 = "torch.aten._unsafe_view"(%24339, %24343) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%24344, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %24345 = "torch.constant.int"() <{value = -2 : i64}> : () -> !torch.int
    %24346 = "torch.aten.unsqueeze"(%23944, %24345) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%24346, %18474) <{shape_expressions = #map25}> : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.int) -> ()
    %24347 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %24348 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %24349 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %24350 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %24351 = "torch.prim.ListConstruct"(%24347, %18481, %24348, %24349, %24350) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %24352 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %24353 = "torch.aten.expand"(%24346, %24351, %24352) : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%24353, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %24354 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %24355 = "torch.aten.clone"(%24353, %24354) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%24355, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %24356 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %24357 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %24358 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %24359 = "torch.prim.ListConstruct"(%24356, %18481, %24357, %24358) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %24360 = "torch.aten._unsafe_view"(%24355, %24359) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%24360, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %24361 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %24362 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %24363 = "torch.aten.transpose.int"(%24094, %24361, %24362) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%24363, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %24364 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %24365 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %24366 = "torch.aten.transpose.int"(%24344, %24364, %24365) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%24366, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %24367 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %24368 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %24369 = "torch.aten.transpose.int"(%24360, %24367, %24368) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%24369, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %24370 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %24371 = "torch.aten.squeeze.dim"(%18570, %24370) : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,1,?,?],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%24371, %18474) <{shape_expressions = #map8}> : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> ()
    %24372 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %24373 = "torch.aten.squeeze.dim"(%24371, %24372) : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,1,?,?],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%24373, %18474) <{shape_expressions = #map8}> : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> ()
    %24374 = "torch_c.to_builtin_tensor"(%24363) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %24375 = "torch_c.to_builtin_tensor"(%24366) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %24376 = "torch_c.to_builtin_tensor"(%24369) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %24377 = "torch_c.to_builtin_tensor"(%24373) : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>) -> tensor<4x1x?x?xf8E4M3FNUZ>
    %24378 = "tensor.cast"(%24377) : (tensor<4x1x?x?xf8E4M3FNUZ>) -> tensor<?x?x?x?xf8E4M3FNUZ>
    %24379 = "torch_c.to_builtin_tensor"(%17529) : (!torch.vtensor<[],f32>) -> tensor<f32>
    %24380 = "util.call"(%24374, %24375, %24376, %24379, %24378) <{callee = @sharktank_masked_flash_attention_4_32_128_128_f8E4M3FNUZ_f32_f32}> : (tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<f32>, tensor<?x?x?x?xf8E4M3FNUZ>) -> tensor<4x32x?x128xf32>
    %24381 = "torch_c.from_builtin_tensor"(%24380) : (tensor<4x32x?x128xf32>) -> !torch.vtensor<[4,32,?,128],f32>
    "torch.bind_symbolic_shape"(%24381, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f32>, !torch.int) -> ()
    %24382 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %24383 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %24384 = "torch.aten.transpose.int"(%24381, %24382, %24383) : (!torch.vtensor<[4,32,?,128],f32>, !torch.int, !torch.int) -> !torch.vtensor<[4,?,32,128],f32>
    "torch.bind_symbolic_shape"(%24384, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f32>, !torch.int) -> ()
    %24385 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %24386 = "torch.aten.clone"(%24384, %24385) : (!torch.vtensor<[4,?,32,128],f32>, !torch.int) -> !torch.vtensor<[4,?,32,128],f32>
    "torch.bind_symbolic_shape"(%24386, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f32>, !torch.int) -> ()
    %24387 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %24388 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %24389 = "torch.prim.ListConstruct"(%24387, %18481, %24388) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %24390 = "torch.aten._unsafe_view"(%24386, %24389) : (!torch.vtensor<[4,?,32,128],f32>, !torch.list<int>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%24390, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %24391 = "torch.aten.div.Tensor"(%24390, %17531) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%24391, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %24392 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %24393 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %24394 = "torch.aten.clamp"(%24391, %24392, %24393) : (!torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%24394, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %24395 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %24396 = "torch.prims.convert_element_type"(%24394, %24395) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%24396, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %24397 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %24398 = "torch.aten.unsqueeze"(%17533, %24397) : (!torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>
    %24399 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %24400 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %24401 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %24402 = "torch.prim.ListConstruct"(%24399, %24400, %24401) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %24403 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %24404 = "torch.aten.expand"(%24398, %24402, %24403) : (!torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ>
    %24405 = "torch_c.to_builtin_tensor"(%24396) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %24406 = "torch_c.to_builtin_tensor"(%24404) : (!torch.vtensor<[4,4096,4096],f8E4M3FNUZ>) -> tensor<4x4096x4096xf8E4M3FNUZ>
    %24407 = "util.call"(%24405, %24406) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x?x4096xf32>
    %24408 = "torch_c.from_builtin_tensor"(%24407) : (tensor<4x?x4096xf32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%24408, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %24409 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %24410 = "torch.prims.convert_element_type"(%24408, %24409) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%24410, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %24411 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %24412 = "torch.aten.add.Tensor"(%23838, %24410, %24411) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%24412, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %24413 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %24414 = "torch.prims.convert_element_type"(%24412, %24413) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%24414, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %24415 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %24416 = "torch.aten.pow.Tensor_Scalar"(%24414, %24415) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%24416, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %24417 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %24418 = "torch.prim.ListConstruct"(%24417) : (!torch.int) -> !torch.list<int>
    %24419 = "torch.constant.bool"() <{value = true}> : () -> !torch.bool
    %24420 = "torch.constant.none"() : () -> !torch.none
    %24421 = "torch.aten.mean.dim"(%24416, %24418, %24419, %24420) : (!torch.vtensor<[4,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%24421, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %24422 = "torch.constant.float"() <{value = 1.000000e-05 : f64}> : () -> !torch.float
    %24423 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %24424 = "torch.aten.add.Scalar"(%24421, %24422, %24423) : (!torch.vtensor<[4,?,1],f32>, !torch.float, !torch.int) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%24424, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %24425 = "torch.aten.rsqrt"(%24424) : (!torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%24425, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %24426 = "torch.aten.mul.Tensor"(%24414, %24425) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%24426, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %24427 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %24428 = "torch.prims.convert_element_type"(%24426, %24427) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%24428, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %24429 = "torch.aten.mul.Tensor"(%17535, %24428) : (!torch.vtensor<[4096],bf16>, !torch.vtensor<[4,?,4096],bf16>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%24429, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %24430 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %24431 = "torch.prims.convert_element_type"(%24429, %24430) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%24431, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %24432 = "torch.aten.div.Tensor"(%24431, %17537) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%24432, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %24433 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %24434 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %24435 = "torch.aten.clamp"(%24432, %24433, %24434) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%24435, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %24436 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %24437 = "torch.prims.convert_element_type"(%24435, %24436) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%24437, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %24438 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %24439 = "torch.aten.unsqueeze"(%17539, %24438) : (!torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>
    %24440 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %24441 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %24442 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %24443 = "torch.prim.ListConstruct"(%24440, %24441, %24442) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %24444 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %24445 = "torch.aten.expand"(%24439, %24443, %24444) : (!torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ>
    %24446 = "torch_c.to_builtin_tensor"(%24437) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %24447 = "torch_c.to_builtin_tensor"(%24445) : (!torch.vtensor<[4,14336,4096],f8E4M3FNUZ>) -> tensor<4x14336x4096xf8E4M3FNUZ>
    %24448 = "util.call"(%24446, %24447) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x?x14336xf32>
    %24449 = "torch_c.from_builtin_tensor"(%24448) : (tensor<4x?x14336xf32>) -> !torch.vtensor<[4,?,14336],f32>
    "torch.bind_symbolic_shape"(%24449, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> ()
    %24450 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %24451 = "torch.prims.convert_element_type"(%24449, %24450) : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%24451, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %24452 = "torch.aten.silu"(%24451) : (!torch.vtensor<[4,?,14336],bf16>) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%24452, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %24453 = "torch.aten.div.Tensor"(%24431, %17541) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%24453, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %24454 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %24455 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %24456 = "torch.aten.clamp"(%24453, %24454, %24455) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%24456, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %24457 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %24458 = "torch.prims.convert_element_type"(%24456, %24457) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%24458, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %24459 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %24460 = "torch.aten.unsqueeze"(%17543, %24459) : (!torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>
    %24461 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %24462 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %24463 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %24464 = "torch.prim.ListConstruct"(%24461, %24462, %24463) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %24465 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %24466 = "torch.aten.expand"(%24460, %24464, %24465) : (!torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ>
    %24467 = "torch_c.to_builtin_tensor"(%24458) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %24468 = "torch_c.to_builtin_tensor"(%24466) : (!torch.vtensor<[4,14336,4096],f8E4M3FNUZ>) -> tensor<4x14336x4096xf8E4M3FNUZ>
    %24469 = "util.call"(%24467, %24468) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x?x14336xf32>
    %24470 = "torch_c.from_builtin_tensor"(%24469) : (tensor<4x?x14336xf32>) -> !torch.vtensor<[4,?,14336],f32>
    "torch.bind_symbolic_shape"(%24470, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> ()
    %24471 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %24472 = "torch.prims.convert_element_type"(%24470, %24471) : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%24472, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %24473 = "torch.aten.mul.Tensor"(%24452, %24472) : (!torch.vtensor<[4,?,14336],bf16>, !torch.vtensor<[4,?,14336],bf16>) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%24473, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %24474 = "torch.aten.div.Tensor"(%24473, %17545) : (!torch.vtensor<[4,?,14336],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%24474, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %24475 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %24476 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %24477 = "torch.aten.clamp"(%24474, %24475, %24476) : (!torch.vtensor<[4,?,14336],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%24477, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %24478 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %24479 = "torch.prims.convert_element_type"(%24477, %24478) : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> !torch.vtensor<[4,?,14336],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%24479, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],f8E4M3FNUZ>, !torch.int) -> ()
    %24480 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %24481 = "torch.aten.unsqueeze"(%17547, %24480) : (!torch.vtensor<[4096,14336],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,14336],f8E4M3FNUZ>
    %24482 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %24483 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %24484 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %24485 = "torch.prim.ListConstruct"(%24482, %24483, %24484) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %24486 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %24487 = "torch.aten.expand"(%24481, %24485, %24486) : (!torch.vtensor<[1,4096,14336],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,14336],f8E4M3FNUZ>
    %24488 = "torch_c.to_builtin_tensor"(%24479) : (!torch.vtensor<[4,?,14336],f8E4M3FNUZ>) -> tensor<4x?x14336xf8E4M3FNUZ>
    %24489 = "torch_c.to_builtin_tensor"(%24487) : (!torch.vtensor<[4,4096,14336],f8E4M3FNUZ>) -> tensor<4x4096x14336xf8E4M3FNUZ>
    %24490 = "util.call"(%24488, %24489) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx14336xf8E4M3FNUZ_R4x4096x14336xf8E4M3FNUZ}> : (tensor<4x?x14336xf8E4M3FNUZ>, tensor<4x4096x14336xf8E4M3FNUZ>) -> tensor<4x?x4096xf32>
    %24491 = "torch_c.from_builtin_tensor"(%24490) : (tensor<4x?x4096xf32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%24491, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %24492 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %24493 = "torch.prims.convert_element_type"(%24491, %24492) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%24493, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %24494 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %24495 = "torch.aten.add.Tensor"(%24412, %24493, %24494) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%24495, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %24496 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %24497 = "torch.prims.convert_element_type"(%24495, %24496) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%24497, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %24498 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %24499 = "torch.aten.pow.Tensor_Scalar"(%24497, %24498) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%24499, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %24500 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %24501 = "torch.prim.ListConstruct"(%24500) : (!torch.int) -> !torch.list<int>
    %24502 = "torch.constant.bool"() <{value = true}> : () -> !torch.bool
    %24503 = "torch.constant.none"() : () -> !torch.none
    %24504 = "torch.aten.mean.dim"(%24499, %24501, %24502, %24503) : (!torch.vtensor<[4,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%24504, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %24505 = "torch.constant.float"() <{value = 1.000000e-05 : f64}> : () -> !torch.float
    %24506 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %24507 = "torch.aten.add.Scalar"(%24504, %24505, %24506) : (!torch.vtensor<[4,?,1],f32>, !torch.float, !torch.int) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%24507, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %24508 = "torch.aten.rsqrt"(%24507) : (!torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%24508, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %24509 = "torch.aten.mul.Tensor"(%24497, %24508) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%24509, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %24510 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %24511 = "torch.prims.convert_element_type"(%24509, %24510) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%24511, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %24512 = "torch.aten.mul.Tensor"(%17549, %24511) : (!torch.vtensor<[4096],bf16>, !torch.vtensor<[4,?,4096],bf16>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%24512, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %24513 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %24514 = "torch.prims.convert_element_type"(%24512, %24513) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%24514, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %24515 = "torch.aten.div.Tensor"(%24514, %17551) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%24515, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %24516 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %24517 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %24518 = "torch.aten.clamp"(%24515, %24516, %24517) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%24518, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %24519 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %24520 = "torch.prims.convert_element_type"(%24518, %24519) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%24520, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %24521 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %24522 = "torch.aten.unsqueeze"(%17553, %24521) : (!torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>
    %24523 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %24524 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %24525 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %24526 = "torch.prim.ListConstruct"(%24523, %24524, %24525) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %24527 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %24528 = "torch.aten.expand"(%24522, %24526, %24527) : (!torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ>
    %24529 = "torch_c.to_builtin_tensor"(%24520) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %24530 = "torch_c.to_builtin_tensor"(%24528) : (!torch.vtensor<[4,4096,4096],f8E4M3FNUZ>) -> tensor<4x4096x4096xf8E4M3FNUZ>
    %24531 = "util.call"(%24529, %24530) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x?x4096xf32>
    %24532 = "torch_c.from_builtin_tensor"(%24531) : (tensor<4x?x4096xf32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%24532, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %24533 = "torch.aten.div.Tensor"(%24532, %17555) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%24533, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %24534 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %24535 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %24536 = "torch.aten.clamp"(%24533, %24534, %24535) : (!torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%24536, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %24537 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %24538 = "torch.prims.convert_element_type"(%24536, %24537) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%24538, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %24539 = "torch.aten.div.Tensor"(%24514, %17557) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%24539, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %24540 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %24541 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %24542 = "torch.aten.clamp"(%24539, %24540, %24541) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%24542, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %24543 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %24544 = "torch.prims.convert_element_type"(%24542, %24543) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%24544, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %24545 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %24546 = "torch.aten.unsqueeze"(%17559, %24545) : (!torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>
    %24547 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %24548 = "torch.constant.int"() <{value = 1024 : i64}> : () -> !torch.int
    %24549 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %24550 = "torch.prim.ListConstruct"(%24547, %24548, %24549) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %24551 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %24552 = "torch.aten.expand"(%24546, %24550, %24551) : (!torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ>
    %24553 = "torch_c.to_builtin_tensor"(%24544) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %24554 = "torch_c.to_builtin_tensor"(%24552) : (!torch.vtensor<[4,1024,4096],f8E4M3FNUZ>) -> tensor<4x1024x4096xf8E4M3FNUZ>
    %24555 = "util.call"(%24553, %24554) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x?x1024xf32>
    %24556 = "torch_c.from_builtin_tensor"(%24555) : (tensor<4x?x1024xf32>) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%24556, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %24557 = "torch.aten.div.Tensor"(%24556, %17561) : (!torch.vtensor<[4,?,1024],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%24557, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %24558 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %24559 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %24560 = "torch.aten.clamp"(%24557, %24558, %24559) : (!torch.vtensor<[4,?,1024],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%24560, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %24561 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %24562 = "torch.prims.convert_element_type"(%24560, %24561) : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> !torch.vtensor<[4,?,1024],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%24562, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.int) -> ()
    %24563 = "torch.aten.div.Tensor"(%24514, %17563) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%24563, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %24564 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %24565 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %24566 = "torch.aten.clamp"(%24563, %24564, %24565) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%24566, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %24567 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %24568 = "torch.prims.convert_element_type"(%24566, %24567) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%24568, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %24569 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %24570 = "torch.aten.unsqueeze"(%17565, %24569) : (!torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>
    %24571 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %24572 = "torch.constant.int"() <{value = 1024 : i64}> : () -> !torch.int
    %24573 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %24574 = "torch.prim.ListConstruct"(%24571, %24572, %24573) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %24575 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %24576 = "torch.aten.expand"(%24570, %24574, %24575) : (!torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ>
    %24577 = "torch_c.to_builtin_tensor"(%24568) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %24578 = "torch_c.to_builtin_tensor"(%24576) : (!torch.vtensor<[4,1024,4096],f8E4M3FNUZ>) -> tensor<4x1024x4096xf8E4M3FNUZ>
    %24579 = "util.call"(%24577, %24578) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x?x1024xf32>
    %24580 = "torch_c.from_builtin_tensor"(%24579) : (tensor<4x?x1024xf32>) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%24580, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %24581 = "torch.aten.div.Tensor"(%24580, %17567) : (!torch.vtensor<[4,?,1024],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%24581, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %24582 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %24583 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %24584 = "torch.aten.clamp"(%24581, %24582, %24583) : (!torch.vtensor<[4,?,1024],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%24584, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %24585 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %24586 = "torch.prims.convert_element_type"(%24584, %24585) : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> !torch.vtensor<[4,?,1024],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%24586, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.int) -> ()
    %24587 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %24588 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %24589 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %24590 = "torch.prim.ListConstruct"(%24587, %18481, %24588, %24589) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %24591 = "torch.aten.view"(%24538, %24590) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%24591, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %24592 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %24593 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %24594 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %24595 = "torch.prim.ListConstruct"(%24592, %18481, %24593, %24594) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %24596 = "torch.aten.view"(%24562, %24595) : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%24596, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %24597 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %24598 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %24599 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %24600 = "torch.prim.ListConstruct"(%24597, %18481, %24598, %24599) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %24601 = "torch.aten.view"(%24586, %24600) : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%24601, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %24602 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
    %24603 = "torch.constant.none"() : () -> !torch.none
    %24604 = "torch.constant.none"() : () -> !torch.none
    %24605 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %24606 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %24607 = "torch.aten.arange"(%24602, %24603, %24604, %24605, %24606) : (!torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[131072],si64>
    %24608 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %24609 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %24610 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %24611 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %24612 = "torch.constant.none"() : () -> !torch.none
    %24613 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %24614 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %24615 = "torch.aten.arange.start_step"(%24608, %24609, %24610, %24611, %24612, %24613, %24614) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[64],si64>
    %24616 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %24617 = "torch.prims.convert_element_type"(%24615, %24616) : (!torch.vtensor<[64],si64>, !torch.int) -> !torch.vtensor<[64],f32>
    %24618 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %24619 = "torch.aten.div.Scalar"(%24617, %24618) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %24620 = "torch.constant.float"() <{value = 5.000000e+05 : f64}> : () -> !torch.float
    %24621 = "torch.aten.pow.Scalar"(%24620, %24619) : (!torch.float, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %24622 = "torch.aten.reciprocal"(%24621) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %24623 = "torch.constant.float"() <{value = 1.000000e+00 : f64}> : () -> !torch.float
    %24624 = "torch.aten.mul.Scalar"(%24622, %24623) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
    %24625 = "torch.aten.reciprocal"(%24624) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %24626 = "torch.constant.float"() <{value = 6.2831853071795862 : f64}> : () -> !torch.float
    %24627 = "torch.aten.mul.Scalar"(%24625, %24626) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
    %24628 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
    %24629 = "torch.aten.gt.Scalar"(%24627, %24628) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %24630 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %24631 = "torch.aten.div.Scalar"(%24624, %24630) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %24632 = "torch.aten.where.self"(%24629, %24631, %24624) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %24633 = "torch.aten.reciprocal"(%24627) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %24634 = "torch.constant.int"() <{value = 8192 : i64}> : () -> !torch.int
    %24635 = "torch.aten.mul.Scalar"(%24633, %24634) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %24636 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %24637 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %24638 = "torch.aten.sub.Scalar"(%24635, %24636, %24637) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
    %24639 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %24640 = "torch.aten.div.Scalar"(%24638, %24639) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %24641 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %24642 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %24643 = "torch.aten.rsub.Scalar"(%24640, %24641, %24642) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
    %24644 = "torch.aten.mul.Tensor"(%24643, %24632) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %24645 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %24646 = "torch.aten.div.Scalar"(%24644, %24645) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %24647 = "torch.aten.mul.Tensor"(%24640, %24632) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %24648 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %24649 = "torch.aten.add.Tensor"(%24646, %24647, %24648) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %24650 = "torch.constant.float"() <{value = 2.048000e+03 : f64}> : () -> !torch.float
    %24651 = "torch.aten.lt.Scalar"(%24627, %24650) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %24652 = "torch.aten.bitwise_not"(%24651) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %24653 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
    %24654 = "torch.aten.gt.Scalar"(%24627, %24653) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %24655 = "torch.aten.bitwise_not"(%24654) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %24656 = "torch.aten.mul.Tensor"(%24652, %24655) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %24657 = "torch.aten.where.self"(%24656, %24649, %24632) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %24658 = "torch.prim.ListConstruct"(%24657, %24657) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.list<vtensor>
    %24659 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %24660 = "torch.aten.cat"(%24658, %24659) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[128],f32>
    %24661 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %24662 = "torch.prims.convert_element_type"(%24607, %24661) : (!torch.vtensor<[131072],si64>, !torch.int) -> !torch.vtensor<[131072],f32>
    %24663 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %24664 = "torch.prims.convert_element_type"(%24660, %24663) : (!torch.vtensor<[128],f32>, !torch.int) -> !torch.vtensor<[128],f32>
    %24665 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
    %24666 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %24667 = "torch.prim.ListConstruct"(%24665, %24666) : (!torch.int, !torch.int) -> !torch.list<int>
    %24668 = "torch.aten.view"(%24662, %24667) : (!torch.vtensor<[131072],f32>, !torch.list<int>) -> !torch.vtensor<[131072,1],f32>
    %24669 = "torch.aten.mul.Tensor"(%24668, %24664) : (!torch.vtensor<[131072,1],f32>, !torch.vtensor<[128],f32>) -> !torch.vtensor<[131072,128],f32>
    %24670 = "torch.aten.cos"(%24669) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
    %24671 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %24672 = "torch.prims.convert_element_type"(%24670, %24671) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
    %24673 = "torch.aten.sin"(%24669) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
    %24674 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %24675 = "torch.prims.convert_element_type"(%24673, %24674) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
    %24676 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %24677 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %24678 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %24679 = "torch.aten.slice.Tensor"(%24672, %24676, %24677, %18481, %24678) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%24679, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %24680 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %24681 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %24682 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %24683 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %24684 = "torch.aten.slice.Tensor"(%24679, %24680, %24681, %24682, %24683) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%24684, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %24685 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %24686 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %24687 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %24688 = "torch.aten.slice.Tensor"(%24675, %24685, %24686, %18481, %24687) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%24688, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %24689 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %24690 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %24691 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %24692 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %24693 = "torch.aten.slice.Tensor"(%24688, %24689, %24690, %24691, %24692) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%24693, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %24694 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %24695 = "torch.aten.unsqueeze"(%24684, %24694) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%24695, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %24696 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %24697 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %24698 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %24699 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %24700 = "torch.aten.slice.Tensor"(%24695, %24696, %24697, %24698, %24699) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%24700, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %24701 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %24702 = "torch.aten.unsqueeze"(%24700, %24701) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%24702, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %24703 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %24704 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %24705 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %24706 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %24707 = "torch.aten.slice.Tensor"(%24702, %24703, %24704, %24705, %24706) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%24707, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %24708 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %24709 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %24710 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %24711 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %24712 = "torch.prim.ListConstruct"(%24708, %24709, %24710, %24711) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %24713 = "torch.aten.repeat"(%24707, %24712) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%24713, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
    %24714 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %24715 = "torch.aten.unsqueeze"(%24693, %24714) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%24715, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %24716 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %24717 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %24718 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %24719 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %24720 = "torch.aten.slice.Tensor"(%24715, %24716, %24717, %24718, %24719) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%24720, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %24721 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %24722 = "torch.aten.unsqueeze"(%24720, %24721) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%24722, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %24723 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %24724 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %24725 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %24726 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %24727 = "torch.aten.slice.Tensor"(%24722, %24723, %24724, %24725, %24726) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%24727, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %24728 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %24729 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %24730 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %24731 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %24732 = "torch.prim.ListConstruct"(%24728, %24729, %24730, %24731) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %24733 = "torch.aten.repeat"(%24727, %24732) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%24733, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
    %24734 = "torch.aten.mul.Tensor"(%24591, %24713) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%24734, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %24735 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %24736 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %24737 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %24738 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %24739 = "torch.aten.slice.Tensor"(%24591, %24735, %24736, %24737, %24738) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%24739, %18474) <{shape_expressions = #map18}> : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.int) -> ()
    %24740 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %24741 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %24742 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %24743 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %24744 = "torch.aten.slice.Tensor"(%24591, %24740, %24741, %24742, %24743) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%24744, %18474) <{shape_expressions = #map18}> : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.int) -> ()
    %24745 = "torch.aten.neg"(%24744) : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>) -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%24745, %18474) <{shape_expressions = #map18}> : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.int) -> ()
    %24746 = "torch.prim.ListConstruct"(%24745, %24739) : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>) -> !torch.list<vtensor>
    %24747 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %24748 = "torch.aten.cat"(%24746, %24747) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%24748, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %24749 = "torch.aten.mul.Tensor"(%24748, %24733) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%24749, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %24750 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %24751 = "torch.aten.add.Tensor"(%24734, %24749, %24750) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%24751, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %24752 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
    %24753 = "torch.constant.none"() : () -> !torch.none
    %24754 = "torch.constant.none"() : () -> !torch.none
    %24755 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %24756 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %24757 = "torch.aten.arange"(%24752, %24753, %24754, %24755, %24756) : (!torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[131072],si64>
    %24758 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %24759 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %24760 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %24761 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %24762 = "torch.constant.none"() : () -> !torch.none
    %24763 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %24764 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %24765 = "torch.aten.arange.start_step"(%24758, %24759, %24760, %24761, %24762, %24763, %24764) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[64],si64>
    %24766 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %24767 = "torch.prims.convert_element_type"(%24765, %24766) : (!torch.vtensor<[64],si64>, !torch.int) -> !torch.vtensor<[64],f32>
    %24768 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %24769 = "torch.aten.div.Scalar"(%24767, %24768) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %24770 = "torch.constant.float"() <{value = 5.000000e+05 : f64}> : () -> !torch.float
    %24771 = "torch.aten.pow.Scalar"(%24770, %24769) : (!torch.float, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %24772 = "torch.aten.reciprocal"(%24771) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %24773 = "torch.constant.float"() <{value = 1.000000e+00 : f64}> : () -> !torch.float
    %24774 = "torch.aten.mul.Scalar"(%24772, %24773) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
    %24775 = "torch.aten.reciprocal"(%24774) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %24776 = "torch.constant.float"() <{value = 6.2831853071795862 : f64}> : () -> !torch.float
    %24777 = "torch.aten.mul.Scalar"(%24775, %24776) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
    %24778 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
    %24779 = "torch.aten.gt.Scalar"(%24777, %24778) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %24780 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %24781 = "torch.aten.div.Scalar"(%24774, %24780) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %24782 = "torch.aten.where.self"(%24779, %24781, %24774) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %24783 = "torch.aten.reciprocal"(%24777) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %24784 = "torch.constant.int"() <{value = 8192 : i64}> : () -> !torch.int
    %24785 = "torch.aten.mul.Scalar"(%24783, %24784) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %24786 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %24787 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %24788 = "torch.aten.sub.Scalar"(%24785, %24786, %24787) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
    %24789 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %24790 = "torch.aten.div.Scalar"(%24788, %24789) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %24791 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %24792 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %24793 = "torch.aten.rsub.Scalar"(%24790, %24791, %24792) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
    %24794 = "torch.aten.mul.Tensor"(%24793, %24782) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %24795 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %24796 = "torch.aten.div.Scalar"(%24794, %24795) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %24797 = "torch.aten.mul.Tensor"(%24790, %24782) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %24798 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %24799 = "torch.aten.add.Tensor"(%24796, %24797, %24798) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %24800 = "torch.constant.float"() <{value = 2.048000e+03 : f64}> : () -> !torch.float
    %24801 = "torch.aten.lt.Scalar"(%24777, %24800) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %24802 = "torch.aten.bitwise_not"(%24801) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %24803 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
    %24804 = "torch.aten.gt.Scalar"(%24777, %24803) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %24805 = "torch.aten.bitwise_not"(%24804) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %24806 = "torch.aten.mul.Tensor"(%24802, %24805) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %24807 = "torch.aten.where.self"(%24806, %24799, %24782) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %24808 = "torch.prim.ListConstruct"(%24807, %24807) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.list<vtensor>
    %24809 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %24810 = "torch.aten.cat"(%24808, %24809) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[128],f32>
    %24811 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %24812 = "torch.prims.convert_element_type"(%24757, %24811) : (!torch.vtensor<[131072],si64>, !torch.int) -> !torch.vtensor<[131072],f32>
    %24813 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %24814 = "torch.prims.convert_element_type"(%24810, %24813) : (!torch.vtensor<[128],f32>, !torch.int) -> !torch.vtensor<[128],f32>
    %24815 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
    %24816 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %24817 = "torch.prim.ListConstruct"(%24815, %24816) : (!torch.int, !torch.int) -> !torch.list<int>
    %24818 = "torch.aten.view"(%24812, %24817) : (!torch.vtensor<[131072],f32>, !torch.list<int>) -> !torch.vtensor<[131072,1],f32>
    %24819 = "torch.aten.mul.Tensor"(%24818, %24814) : (!torch.vtensor<[131072,1],f32>, !torch.vtensor<[128],f32>) -> !torch.vtensor<[131072,128],f32>
    %24820 = "torch.aten.cos"(%24819) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
    %24821 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %24822 = "torch.prims.convert_element_type"(%24820, %24821) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
    %24823 = "torch.aten.sin"(%24819) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
    %24824 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %24825 = "torch.prims.convert_element_type"(%24823, %24824) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
    %24826 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %24827 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %24828 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %24829 = "torch.aten.slice.Tensor"(%24822, %24826, %24827, %18481, %24828) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%24829, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %24830 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %24831 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %24832 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %24833 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %24834 = "torch.aten.slice.Tensor"(%24829, %24830, %24831, %24832, %24833) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%24834, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %24835 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %24836 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %24837 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %24838 = "torch.aten.slice.Tensor"(%24825, %24835, %24836, %18481, %24837) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%24838, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %24839 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %24840 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %24841 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %24842 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %24843 = "torch.aten.slice.Tensor"(%24838, %24839, %24840, %24841, %24842) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%24843, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %24844 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %24845 = "torch.aten.unsqueeze"(%24834, %24844) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%24845, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %24846 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %24847 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %24848 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %24849 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %24850 = "torch.aten.slice.Tensor"(%24845, %24846, %24847, %24848, %24849) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%24850, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %24851 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %24852 = "torch.aten.unsqueeze"(%24850, %24851) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%24852, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %24853 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %24854 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %24855 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %24856 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %24857 = "torch.aten.slice.Tensor"(%24852, %24853, %24854, %24855, %24856) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%24857, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %24858 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %24859 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %24860 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %24861 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %24862 = "torch.prim.ListConstruct"(%24858, %24859, %24860, %24861) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %24863 = "torch.aten.repeat"(%24857, %24862) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%24863, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
    %24864 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %24865 = "torch.aten.unsqueeze"(%24843, %24864) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%24865, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %24866 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %24867 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %24868 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %24869 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %24870 = "torch.aten.slice.Tensor"(%24865, %24866, %24867, %24868, %24869) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%24870, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %24871 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %24872 = "torch.aten.unsqueeze"(%24870, %24871) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%24872, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %24873 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %24874 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %24875 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %24876 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %24877 = "torch.aten.slice.Tensor"(%24872, %24873, %24874, %24875, %24876) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%24877, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %24878 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %24879 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %24880 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %24881 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %24882 = "torch.prim.ListConstruct"(%24878, %24879, %24880, %24881) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %24883 = "torch.aten.repeat"(%24877, %24882) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%24883, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
    %24884 = "torch.aten.mul.Tensor"(%24596, %24863) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%24884, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %24885 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %24886 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %24887 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %24888 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %24889 = "torch.aten.slice.Tensor"(%24596, %24885, %24886, %24887, %24888) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%24889, %18474) <{shape_expressions = #map19}> : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.int) -> ()
    %24890 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %24891 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %24892 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %24893 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %24894 = "torch.aten.slice.Tensor"(%24596, %24890, %24891, %24892, %24893) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%24894, %18474) <{shape_expressions = #map19}> : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.int) -> ()
    %24895 = "torch.aten.neg"(%24894) : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>) -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%24895, %18474) <{shape_expressions = #map19}> : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.int) -> ()
    %24896 = "torch.prim.ListConstruct"(%24895, %24889) : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>) -> !torch.list<vtensor>
    %24897 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %24898 = "torch.aten.cat"(%24896, %24897) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%24898, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %24899 = "torch.aten.mul.Tensor"(%24898, %24883) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%24899, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %24900 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %24901 = "torch.aten.add.Tensor"(%24884, %24899, %24900) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%24901, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %24902 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %24903 = "torch.aten.mul.Scalar"(%arg69, %24902) : (!torch.vtensor<[4,?],si64>, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%24903, %18474) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %24904 = "torch.constant.int"() <{value = 18 : i64}> : () -> !torch.int
    %24905 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %24906 = "torch.aten.add.Scalar"(%24903, %24904, %24905) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%24906, %18474) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %24907 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %24908 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %24909 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %24910 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %24911 = "torch.prim.ListConstruct"(%24907, %18477, %24908, %24909, %24910) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %24912 = "torch.aten.view"(%24901, %24911) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%24912, %18474) <{shape_expressions = #map22}> : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %24913 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %24914 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %24915 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %24916 = "torch.prim.ListConstruct"(%19011, %24913, %24914, %24915) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %24917 = "torch.aten.view"(%24912, %24916) : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%24917, %18474) <{shape_expressions = #map23}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %24918 = "torch.prim.ListConstruct"(%19011) : (!torch.int) -> !torch.list<int>
    %24919 = "torch.aten.view"(%24906, %24918) : (!torch.vtensor<[4,?],si64>, !torch.list<int>) -> !torch.vtensor<[?],si64>
    "torch.bind_symbolic_shape"(%24919, %18474) <{shape_expressions = #map24}> : (!torch.vtensor<[?],si64>, !torch.int) -> ()
    %24920 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %24921 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %24922 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %24923 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %24924 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %24925 = "torch.prim.ListConstruct"(%18479, %24920, %24921, %24922, %24923, %24924) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %24926 = "torch.aten.view"(%24328, %24925) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%24926, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %24927 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %24928 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %24929 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %24930 = "torch.prim.ListConstruct"(%18993, %24927, %24928, %24929) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %24931 = "torch.aten.view"(%24926, %24930) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%24931, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %24932 = "torch.prim.ListConstruct"(%24919) : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>>
    %24933 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %24934 = "torch.aten.index_put"(%24931, %24932, %24917, %24933) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.bool) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%24934, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %24935 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %24936 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %24937 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %24938 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %24939 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %24940 = "torch.prim.ListConstruct"(%18479, %24935, %24936, %24937, %24938, %24939) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %24941 = "torch.aten.view"(%24934, %24940) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%24941, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %24942 = "torch.constant.int"() <{value = 2097152 : i64}> : () -> !torch.int
    %24943 = "torch.prim.ListConstruct"(%18479, %24942) : (!torch.int, !torch.int) -> !torch.list<int>
    %24944 = "torch.aten.view"(%24941, %24943) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,2097152],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%24944, %18475) <{shape_expressions = #map2}> : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> ()
    %24945 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %24946 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %24947 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %24948 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %24949 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %24950 = "torch.prim.ListConstruct"(%18479, %24945, %24946, %24947, %24948, %24949) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %24951 = "torch.aten.view"(%24944, %24950) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%24951, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %24952 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %24953 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %24954 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %24955 = "torch.prim.ListConstruct"(%18993, %24952, %24953, %24954) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %24956 = "torch.aten.view"(%24951, %24955) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%24956, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %24957 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %24958 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %24959 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %24960 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %24961 = "torch.prim.ListConstruct"(%24957, %18477, %24958, %24959, %24960) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %24962 = "torch.aten.view"(%24601, %24961) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%24962, %18474) <{shape_expressions = #map22}> : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %24963 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %24964 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %24965 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %24966 = "torch.prim.ListConstruct"(%19011, %24963, %24964, %24965) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %24967 = "torch.aten.view"(%24962, %24966) : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%24967, %18474) <{shape_expressions = #map23}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %24968 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %24969 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %24970 = "torch.aten.add.Scalar"(%24906, %24968, %24969) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%24970, %18474) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %24971 = "torch.prim.ListConstruct"(%19011) : (!torch.int) -> !torch.list<int>
    %24972 = "torch.aten.view"(%24970, %24971) : (!torch.vtensor<[4,?],si64>, !torch.list<int>) -> !torch.vtensor<[?],si64>
    "torch.bind_symbolic_shape"(%24972, %18474) <{shape_expressions = #map24}> : (!torch.vtensor<[?],si64>, !torch.int) -> ()
    %24973 = "torch.prim.ListConstruct"(%24972) : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>>
    %24974 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %24975 = "torch.aten.index_put"(%24956, %24973, %24967, %24974) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.bool) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%24975, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %24976 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %24977 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %24978 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %24979 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %24980 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %24981 = "torch.prim.ListConstruct"(%18479, %24976, %24977, %24978, %24979, %24980) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %24982 = "torch.aten.view"(%24975, %24981) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%24982, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %24983 = "torch.constant.int"() <{value = 2097152 : i64}> : () -> !torch.int
    %24984 = "torch.prim.ListConstruct"(%18479, %24983) : (!torch.int, !torch.int) -> !torch.list<int>
    %24985 = "torch.aten.view"(%24982, %24984) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,2097152],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%24985, %18475) <{shape_expressions = #map2}> : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> ()
    %24986 = "torch.constant.int"() <{value = -2 : i64}> : () -> !torch.int
    %24987 = "torch.aten.unsqueeze"(%24901, %24986) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%24987, %18474) <{shape_expressions = #map25}> : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.int) -> ()
    %24988 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %24989 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %24990 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %24991 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %24992 = "torch.prim.ListConstruct"(%24988, %18481, %24989, %24990, %24991) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %24993 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %24994 = "torch.aten.expand"(%24987, %24992, %24993) : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%24994, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %24995 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %24996 = "torch.aten.clone"(%24994, %24995) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%24996, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %24997 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %24998 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %24999 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %25000 = "torch.prim.ListConstruct"(%24997, %18481, %24998, %24999) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %25001 = "torch.aten._unsafe_view"(%24996, %25000) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%25001, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %25002 = "torch.constant.int"() <{value = -2 : i64}> : () -> !torch.int
    %25003 = "torch.aten.unsqueeze"(%24601, %25002) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%25003, %18474) <{shape_expressions = #map25}> : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.int) -> ()
    %25004 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %25005 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %25006 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %25007 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %25008 = "torch.prim.ListConstruct"(%25004, %18481, %25005, %25006, %25007) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %25009 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %25010 = "torch.aten.expand"(%25003, %25008, %25009) : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%25010, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %25011 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %25012 = "torch.aten.clone"(%25010, %25011) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%25012, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %25013 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %25014 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %25015 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %25016 = "torch.prim.ListConstruct"(%25013, %18481, %25014, %25015) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %25017 = "torch.aten._unsafe_view"(%25012, %25016) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%25017, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %25018 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %25019 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %25020 = "torch.aten.transpose.int"(%24751, %25018, %25019) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%25020, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %25021 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %25022 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %25023 = "torch.aten.transpose.int"(%25001, %25021, %25022) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%25023, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %25024 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %25025 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %25026 = "torch.aten.transpose.int"(%25017, %25024, %25025) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%25026, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %25027 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %25028 = "torch.aten.squeeze.dim"(%18570, %25027) : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,1,?,?],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%25028, %18474) <{shape_expressions = #map8}> : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> ()
    %25029 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %25030 = "torch.aten.squeeze.dim"(%25028, %25029) : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,1,?,?],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%25030, %18474) <{shape_expressions = #map8}> : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> ()
    %25031 = "torch_c.to_builtin_tensor"(%25020) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %25032 = "torch_c.to_builtin_tensor"(%25023) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %25033 = "torch_c.to_builtin_tensor"(%25026) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %25034 = "torch_c.to_builtin_tensor"(%25030) : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>) -> tensor<4x1x?x?xf8E4M3FNUZ>
    %25035 = "tensor.cast"(%25034) : (tensor<4x1x?x?xf8E4M3FNUZ>) -> tensor<?x?x?x?xf8E4M3FNUZ>
    %25036 = "torch_c.to_builtin_tensor"(%17569) : (!torch.vtensor<[],f32>) -> tensor<f32>
    %25037 = "util.call"(%25031, %25032, %25033, %25036, %25035) <{callee = @sharktank_masked_flash_attention_4_32_128_128_f8E4M3FNUZ_f32_f32}> : (tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<f32>, tensor<?x?x?x?xf8E4M3FNUZ>) -> tensor<4x32x?x128xf32>
    %25038 = "torch_c.from_builtin_tensor"(%25037) : (tensor<4x32x?x128xf32>) -> !torch.vtensor<[4,32,?,128],f32>
    "torch.bind_symbolic_shape"(%25038, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f32>, !torch.int) -> ()
    %25039 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %25040 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %25041 = "torch.aten.transpose.int"(%25038, %25039, %25040) : (!torch.vtensor<[4,32,?,128],f32>, !torch.int, !torch.int) -> !torch.vtensor<[4,?,32,128],f32>
    "torch.bind_symbolic_shape"(%25041, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f32>, !torch.int) -> ()
    %25042 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %25043 = "torch.aten.clone"(%25041, %25042) : (!torch.vtensor<[4,?,32,128],f32>, !torch.int) -> !torch.vtensor<[4,?,32,128],f32>
    "torch.bind_symbolic_shape"(%25043, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f32>, !torch.int) -> ()
    %25044 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %25045 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %25046 = "torch.prim.ListConstruct"(%25044, %18481, %25045) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %25047 = "torch.aten._unsafe_view"(%25043, %25046) : (!torch.vtensor<[4,?,32,128],f32>, !torch.list<int>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%25047, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %25048 = "torch.aten.div.Tensor"(%25047, %17571) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%25048, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %25049 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %25050 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %25051 = "torch.aten.clamp"(%25048, %25049, %25050) : (!torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%25051, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %25052 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %25053 = "torch.prims.convert_element_type"(%25051, %25052) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%25053, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %25054 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %25055 = "torch.aten.unsqueeze"(%17573, %25054) : (!torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>
    %25056 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %25057 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %25058 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %25059 = "torch.prim.ListConstruct"(%25056, %25057, %25058) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %25060 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %25061 = "torch.aten.expand"(%25055, %25059, %25060) : (!torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ>
    %25062 = "torch_c.to_builtin_tensor"(%25053) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %25063 = "torch_c.to_builtin_tensor"(%25061) : (!torch.vtensor<[4,4096,4096],f8E4M3FNUZ>) -> tensor<4x4096x4096xf8E4M3FNUZ>
    %25064 = "util.call"(%25062, %25063) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x?x4096xf32>
    %25065 = "torch_c.from_builtin_tensor"(%25064) : (tensor<4x?x4096xf32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%25065, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %25066 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %25067 = "torch.prims.convert_element_type"(%25065, %25066) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%25067, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %25068 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %25069 = "torch.aten.add.Tensor"(%24495, %25067, %25068) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%25069, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %25070 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %25071 = "torch.prims.convert_element_type"(%25069, %25070) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%25071, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %25072 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %25073 = "torch.aten.pow.Tensor_Scalar"(%25071, %25072) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%25073, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %25074 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %25075 = "torch.prim.ListConstruct"(%25074) : (!torch.int) -> !torch.list<int>
    %25076 = "torch.constant.bool"() <{value = true}> : () -> !torch.bool
    %25077 = "torch.constant.none"() : () -> !torch.none
    %25078 = "torch.aten.mean.dim"(%25073, %25075, %25076, %25077) : (!torch.vtensor<[4,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%25078, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %25079 = "torch.constant.float"() <{value = 1.000000e-05 : f64}> : () -> !torch.float
    %25080 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %25081 = "torch.aten.add.Scalar"(%25078, %25079, %25080) : (!torch.vtensor<[4,?,1],f32>, !torch.float, !torch.int) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%25081, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %25082 = "torch.aten.rsqrt"(%25081) : (!torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%25082, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %25083 = "torch.aten.mul.Tensor"(%25071, %25082) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%25083, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %25084 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %25085 = "torch.prims.convert_element_type"(%25083, %25084) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%25085, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %25086 = "torch.aten.mul.Tensor"(%17575, %25085) : (!torch.vtensor<[4096],bf16>, !torch.vtensor<[4,?,4096],bf16>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%25086, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %25087 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %25088 = "torch.prims.convert_element_type"(%25086, %25087) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%25088, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %25089 = "torch.aten.div.Tensor"(%25088, %17577) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%25089, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %25090 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %25091 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %25092 = "torch.aten.clamp"(%25089, %25090, %25091) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%25092, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %25093 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %25094 = "torch.prims.convert_element_type"(%25092, %25093) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%25094, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %25095 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %25096 = "torch.aten.unsqueeze"(%17579, %25095) : (!torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>
    %25097 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %25098 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %25099 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %25100 = "torch.prim.ListConstruct"(%25097, %25098, %25099) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %25101 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %25102 = "torch.aten.expand"(%25096, %25100, %25101) : (!torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ>
    %25103 = "torch_c.to_builtin_tensor"(%25094) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %25104 = "torch_c.to_builtin_tensor"(%25102) : (!torch.vtensor<[4,14336,4096],f8E4M3FNUZ>) -> tensor<4x14336x4096xf8E4M3FNUZ>
    %25105 = "util.call"(%25103, %25104) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x?x14336xf32>
    %25106 = "torch_c.from_builtin_tensor"(%25105) : (tensor<4x?x14336xf32>) -> !torch.vtensor<[4,?,14336],f32>
    "torch.bind_symbolic_shape"(%25106, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> ()
    %25107 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %25108 = "torch.prims.convert_element_type"(%25106, %25107) : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%25108, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %25109 = "torch.aten.silu"(%25108) : (!torch.vtensor<[4,?,14336],bf16>) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%25109, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %25110 = "torch.aten.div.Tensor"(%25088, %17581) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%25110, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %25111 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %25112 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %25113 = "torch.aten.clamp"(%25110, %25111, %25112) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%25113, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %25114 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %25115 = "torch.prims.convert_element_type"(%25113, %25114) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%25115, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %25116 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %25117 = "torch.aten.unsqueeze"(%17583, %25116) : (!torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>
    %25118 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %25119 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %25120 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %25121 = "torch.prim.ListConstruct"(%25118, %25119, %25120) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %25122 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %25123 = "torch.aten.expand"(%25117, %25121, %25122) : (!torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ>
    %25124 = "torch_c.to_builtin_tensor"(%25115) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %25125 = "torch_c.to_builtin_tensor"(%25123) : (!torch.vtensor<[4,14336,4096],f8E4M3FNUZ>) -> tensor<4x14336x4096xf8E4M3FNUZ>
    %25126 = "util.call"(%25124, %25125) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x?x14336xf32>
    %25127 = "torch_c.from_builtin_tensor"(%25126) : (tensor<4x?x14336xf32>) -> !torch.vtensor<[4,?,14336],f32>
    "torch.bind_symbolic_shape"(%25127, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> ()
    %25128 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %25129 = "torch.prims.convert_element_type"(%25127, %25128) : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%25129, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %25130 = "torch.aten.mul.Tensor"(%25109, %25129) : (!torch.vtensor<[4,?,14336],bf16>, !torch.vtensor<[4,?,14336],bf16>) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%25130, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %25131 = "torch.aten.div.Tensor"(%25130, %17585) : (!torch.vtensor<[4,?,14336],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%25131, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %25132 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %25133 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %25134 = "torch.aten.clamp"(%25131, %25132, %25133) : (!torch.vtensor<[4,?,14336],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%25134, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %25135 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %25136 = "torch.prims.convert_element_type"(%25134, %25135) : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> !torch.vtensor<[4,?,14336],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%25136, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],f8E4M3FNUZ>, !torch.int) -> ()
    %25137 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %25138 = "torch.aten.unsqueeze"(%17587, %25137) : (!torch.vtensor<[4096,14336],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,14336],f8E4M3FNUZ>
    %25139 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %25140 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %25141 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %25142 = "torch.prim.ListConstruct"(%25139, %25140, %25141) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %25143 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %25144 = "torch.aten.expand"(%25138, %25142, %25143) : (!torch.vtensor<[1,4096,14336],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,14336],f8E4M3FNUZ>
    %25145 = "torch_c.to_builtin_tensor"(%25136) : (!torch.vtensor<[4,?,14336],f8E4M3FNUZ>) -> tensor<4x?x14336xf8E4M3FNUZ>
    %25146 = "torch_c.to_builtin_tensor"(%25144) : (!torch.vtensor<[4,4096,14336],f8E4M3FNUZ>) -> tensor<4x4096x14336xf8E4M3FNUZ>
    %25147 = "util.call"(%25145, %25146) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx14336xf8E4M3FNUZ_R4x4096x14336xf8E4M3FNUZ}> : (tensor<4x?x14336xf8E4M3FNUZ>, tensor<4x4096x14336xf8E4M3FNUZ>) -> tensor<4x?x4096xf32>
    %25148 = "torch_c.from_builtin_tensor"(%25147) : (tensor<4x?x4096xf32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%25148, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %25149 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %25150 = "torch.prims.convert_element_type"(%25148, %25149) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%25150, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %25151 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %25152 = "torch.aten.add.Tensor"(%25069, %25150, %25151) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%25152, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %25153 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %25154 = "torch.prims.convert_element_type"(%25152, %25153) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%25154, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %25155 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %25156 = "torch.aten.pow.Tensor_Scalar"(%25154, %25155) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%25156, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %25157 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %25158 = "torch.prim.ListConstruct"(%25157) : (!torch.int) -> !torch.list<int>
    %25159 = "torch.constant.bool"() <{value = true}> : () -> !torch.bool
    %25160 = "torch.constant.none"() : () -> !torch.none
    %25161 = "torch.aten.mean.dim"(%25156, %25158, %25159, %25160) : (!torch.vtensor<[4,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%25161, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %25162 = "torch.constant.float"() <{value = 1.000000e-05 : f64}> : () -> !torch.float
    %25163 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %25164 = "torch.aten.add.Scalar"(%25161, %25162, %25163) : (!torch.vtensor<[4,?,1],f32>, !torch.float, !torch.int) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%25164, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %25165 = "torch.aten.rsqrt"(%25164) : (!torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%25165, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %25166 = "torch.aten.mul.Tensor"(%25154, %25165) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%25166, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %25167 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %25168 = "torch.prims.convert_element_type"(%25166, %25167) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%25168, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %25169 = "torch.aten.mul.Tensor"(%17589, %25168) : (!torch.vtensor<[4096],bf16>, !torch.vtensor<[4,?,4096],bf16>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%25169, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %25170 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %25171 = "torch.prims.convert_element_type"(%25169, %25170) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%25171, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %25172 = "torch.aten.div.Tensor"(%25171, %17591) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%25172, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %25173 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %25174 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %25175 = "torch.aten.clamp"(%25172, %25173, %25174) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%25175, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %25176 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %25177 = "torch.prims.convert_element_type"(%25175, %25176) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%25177, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %25178 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %25179 = "torch.aten.unsqueeze"(%17593, %25178) : (!torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>
    %25180 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %25181 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %25182 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %25183 = "torch.prim.ListConstruct"(%25180, %25181, %25182) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %25184 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %25185 = "torch.aten.expand"(%25179, %25183, %25184) : (!torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ>
    %25186 = "torch_c.to_builtin_tensor"(%25177) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %25187 = "torch_c.to_builtin_tensor"(%25185) : (!torch.vtensor<[4,4096,4096],f8E4M3FNUZ>) -> tensor<4x4096x4096xf8E4M3FNUZ>
    %25188 = "util.call"(%25186, %25187) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x?x4096xf32>
    %25189 = "torch_c.from_builtin_tensor"(%25188) : (tensor<4x?x4096xf32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%25189, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %25190 = "torch.aten.div.Tensor"(%25189, %17595) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%25190, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %25191 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %25192 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %25193 = "torch.aten.clamp"(%25190, %25191, %25192) : (!torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%25193, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %25194 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %25195 = "torch.prims.convert_element_type"(%25193, %25194) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%25195, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %25196 = "torch.aten.div.Tensor"(%25171, %17597) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%25196, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %25197 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %25198 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %25199 = "torch.aten.clamp"(%25196, %25197, %25198) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%25199, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %25200 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %25201 = "torch.prims.convert_element_type"(%25199, %25200) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%25201, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %25202 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %25203 = "torch.aten.unsqueeze"(%17599, %25202) : (!torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>
    %25204 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %25205 = "torch.constant.int"() <{value = 1024 : i64}> : () -> !torch.int
    %25206 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %25207 = "torch.prim.ListConstruct"(%25204, %25205, %25206) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %25208 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %25209 = "torch.aten.expand"(%25203, %25207, %25208) : (!torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ>
    %25210 = "torch_c.to_builtin_tensor"(%25201) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %25211 = "torch_c.to_builtin_tensor"(%25209) : (!torch.vtensor<[4,1024,4096],f8E4M3FNUZ>) -> tensor<4x1024x4096xf8E4M3FNUZ>
    %25212 = "util.call"(%25210, %25211) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x?x1024xf32>
    %25213 = "torch_c.from_builtin_tensor"(%25212) : (tensor<4x?x1024xf32>) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%25213, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %25214 = "torch.aten.div.Tensor"(%25213, %17601) : (!torch.vtensor<[4,?,1024],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%25214, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %25215 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %25216 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %25217 = "torch.aten.clamp"(%25214, %25215, %25216) : (!torch.vtensor<[4,?,1024],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%25217, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %25218 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %25219 = "torch.prims.convert_element_type"(%25217, %25218) : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> !torch.vtensor<[4,?,1024],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%25219, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.int) -> ()
    %25220 = "torch.aten.div.Tensor"(%25171, %17603) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%25220, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %25221 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %25222 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %25223 = "torch.aten.clamp"(%25220, %25221, %25222) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%25223, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %25224 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %25225 = "torch.prims.convert_element_type"(%25223, %25224) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%25225, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %25226 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %25227 = "torch.aten.unsqueeze"(%17605, %25226) : (!torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>
    %25228 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %25229 = "torch.constant.int"() <{value = 1024 : i64}> : () -> !torch.int
    %25230 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %25231 = "torch.prim.ListConstruct"(%25228, %25229, %25230) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %25232 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %25233 = "torch.aten.expand"(%25227, %25231, %25232) : (!torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ>
    %25234 = "torch_c.to_builtin_tensor"(%25225) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %25235 = "torch_c.to_builtin_tensor"(%25233) : (!torch.vtensor<[4,1024,4096],f8E4M3FNUZ>) -> tensor<4x1024x4096xf8E4M3FNUZ>
    %25236 = "util.call"(%25234, %25235) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x?x1024xf32>
    %25237 = "torch_c.from_builtin_tensor"(%25236) : (tensor<4x?x1024xf32>) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%25237, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %25238 = "torch.aten.div.Tensor"(%25237, %17607) : (!torch.vtensor<[4,?,1024],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%25238, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %25239 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %25240 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %25241 = "torch.aten.clamp"(%25238, %25239, %25240) : (!torch.vtensor<[4,?,1024],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%25241, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %25242 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %25243 = "torch.prims.convert_element_type"(%25241, %25242) : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> !torch.vtensor<[4,?,1024],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%25243, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.int) -> ()
    %25244 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %25245 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %25246 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %25247 = "torch.prim.ListConstruct"(%25244, %18481, %25245, %25246) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %25248 = "torch.aten.view"(%25195, %25247) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%25248, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %25249 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %25250 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %25251 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %25252 = "torch.prim.ListConstruct"(%25249, %18481, %25250, %25251) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %25253 = "torch.aten.view"(%25219, %25252) : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%25253, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %25254 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %25255 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %25256 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %25257 = "torch.prim.ListConstruct"(%25254, %18481, %25255, %25256) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %25258 = "torch.aten.view"(%25243, %25257) : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%25258, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %25259 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
    %25260 = "torch.constant.none"() : () -> !torch.none
    %25261 = "torch.constant.none"() : () -> !torch.none
    %25262 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %25263 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %25264 = "torch.aten.arange"(%25259, %25260, %25261, %25262, %25263) : (!torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[131072],si64>
    %25265 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %25266 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %25267 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %25268 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %25269 = "torch.constant.none"() : () -> !torch.none
    %25270 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %25271 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %25272 = "torch.aten.arange.start_step"(%25265, %25266, %25267, %25268, %25269, %25270, %25271) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[64],si64>
    %25273 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %25274 = "torch.prims.convert_element_type"(%25272, %25273) : (!torch.vtensor<[64],si64>, !torch.int) -> !torch.vtensor<[64],f32>
    %25275 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %25276 = "torch.aten.div.Scalar"(%25274, %25275) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %25277 = "torch.constant.float"() <{value = 5.000000e+05 : f64}> : () -> !torch.float
    %25278 = "torch.aten.pow.Scalar"(%25277, %25276) : (!torch.float, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %25279 = "torch.aten.reciprocal"(%25278) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %25280 = "torch.constant.float"() <{value = 1.000000e+00 : f64}> : () -> !torch.float
    %25281 = "torch.aten.mul.Scalar"(%25279, %25280) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
    %25282 = "torch.aten.reciprocal"(%25281) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %25283 = "torch.constant.float"() <{value = 6.2831853071795862 : f64}> : () -> !torch.float
    %25284 = "torch.aten.mul.Scalar"(%25282, %25283) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
    %25285 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
    %25286 = "torch.aten.gt.Scalar"(%25284, %25285) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %25287 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %25288 = "torch.aten.div.Scalar"(%25281, %25287) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %25289 = "torch.aten.where.self"(%25286, %25288, %25281) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %25290 = "torch.aten.reciprocal"(%25284) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %25291 = "torch.constant.int"() <{value = 8192 : i64}> : () -> !torch.int
    %25292 = "torch.aten.mul.Scalar"(%25290, %25291) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %25293 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %25294 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %25295 = "torch.aten.sub.Scalar"(%25292, %25293, %25294) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
    %25296 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %25297 = "torch.aten.div.Scalar"(%25295, %25296) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %25298 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %25299 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %25300 = "torch.aten.rsub.Scalar"(%25297, %25298, %25299) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
    %25301 = "torch.aten.mul.Tensor"(%25300, %25289) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %25302 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %25303 = "torch.aten.div.Scalar"(%25301, %25302) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %25304 = "torch.aten.mul.Tensor"(%25297, %25289) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %25305 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %25306 = "torch.aten.add.Tensor"(%25303, %25304, %25305) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %25307 = "torch.constant.float"() <{value = 2.048000e+03 : f64}> : () -> !torch.float
    %25308 = "torch.aten.lt.Scalar"(%25284, %25307) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %25309 = "torch.aten.bitwise_not"(%25308) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %25310 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
    %25311 = "torch.aten.gt.Scalar"(%25284, %25310) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %25312 = "torch.aten.bitwise_not"(%25311) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %25313 = "torch.aten.mul.Tensor"(%25309, %25312) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %25314 = "torch.aten.where.self"(%25313, %25306, %25289) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %25315 = "torch.prim.ListConstruct"(%25314, %25314) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.list<vtensor>
    %25316 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %25317 = "torch.aten.cat"(%25315, %25316) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[128],f32>
    %25318 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %25319 = "torch.prims.convert_element_type"(%25264, %25318) : (!torch.vtensor<[131072],si64>, !torch.int) -> !torch.vtensor<[131072],f32>
    %25320 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %25321 = "torch.prims.convert_element_type"(%25317, %25320) : (!torch.vtensor<[128],f32>, !torch.int) -> !torch.vtensor<[128],f32>
    %25322 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
    %25323 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %25324 = "torch.prim.ListConstruct"(%25322, %25323) : (!torch.int, !torch.int) -> !torch.list<int>
    %25325 = "torch.aten.view"(%25319, %25324) : (!torch.vtensor<[131072],f32>, !torch.list<int>) -> !torch.vtensor<[131072,1],f32>
    %25326 = "torch.aten.mul.Tensor"(%25325, %25321) : (!torch.vtensor<[131072,1],f32>, !torch.vtensor<[128],f32>) -> !torch.vtensor<[131072,128],f32>
    %25327 = "torch.aten.cos"(%25326) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
    %25328 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %25329 = "torch.prims.convert_element_type"(%25327, %25328) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
    %25330 = "torch.aten.sin"(%25326) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
    %25331 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %25332 = "torch.prims.convert_element_type"(%25330, %25331) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
    %25333 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %25334 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %25335 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %25336 = "torch.aten.slice.Tensor"(%25329, %25333, %25334, %18481, %25335) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%25336, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %25337 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %25338 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %25339 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %25340 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %25341 = "torch.aten.slice.Tensor"(%25336, %25337, %25338, %25339, %25340) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%25341, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %25342 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %25343 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %25344 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %25345 = "torch.aten.slice.Tensor"(%25332, %25342, %25343, %18481, %25344) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%25345, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %25346 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %25347 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %25348 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %25349 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %25350 = "torch.aten.slice.Tensor"(%25345, %25346, %25347, %25348, %25349) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%25350, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %25351 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %25352 = "torch.aten.unsqueeze"(%25341, %25351) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%25352, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %25353 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %25354 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %25355 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %25356 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %25357 = "torch.aten.slice.Tensor"(%25352, %25353, %25354, %25355, %25356) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%25357, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %25358 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %25359 = "torch.aten.unsqueeze"(%25357, %25358) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%25359, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %25360 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %25361 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %25362 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %25363 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %25364 = "torch.aten.slice.Tensor"(%25359, %25360, %25361, %25362, %25363) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%25364, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %25365 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %25366 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %25367 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %25368 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %25369 = "torch.prim.ListConstruct"(%25365, %25366, %25367, %25368) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %25370 = "torch.aten.repeat"(%25364, %25369) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%25370, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
    %25371 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %25372 = "torch.aten.unsqueeze"(%25350, %25371) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%25372, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %25373 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %25374 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %25375 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %25376 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %25377 = "torch.aten.slice.Tensor"(%25372, %25373, %25374, %25375, %25376) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%25377, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %25378 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %25379 = "torch.aten.unsqueeze"(%25377, %25378) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%25379, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %25380 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %25381 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %25382 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %25383 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %25384 = "torch.aten.slice.Tensor"(%25379, %25380, %25381, %25382, %25383) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%25384, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %25385 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %25386 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %25387 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %25388 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %25389 = "torch.prim.ListConstruct"(%25385, %25386, %25387, %25388) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %25390 = "torch.aten.repeat"(%25384, %25389) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%25390, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
    %25391 = "torch.aten.mul.Tensor"(%25248, %25370) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%25391, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %25392 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %25393 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %25394 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %25395 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %25396 = "torch.aten.slice.Tensor"(%25248, %25392, %25393, %25394, %25395) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%25396, %18474) <{shape_expressions = #map18}> : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.int) -> ()
    %25397 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %25398 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %25399 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %25400 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %25401 = "torch.aten.slice.Tensor"(%25248, %25397, %25398, %25399, %25400) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%25401, %18474) <{shape_expressions = #map18}> : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.int) -> ()
    %25402 = "torch.aten.neg"(%25401) : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>) -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%25402, %18474) <{shape_expressions = #map18}> : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.int) -> ()
    %25403 = "torch.prim.ListConstruct"(%25402, %25396) : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>) -> !torch.list<vtensor>
    %25404 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %25405 = "torch.aten.cat"(%25403, %25404) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%25405, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %25406 = "torch.aten.mul.Tensor"(%25405, %25390) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%25406, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %25407 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %25408 = "torch.aten.add.Tensor"(%25391, %25406, %25407) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%25408, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %25409 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
    %25410 = "torch.constant.none"() : () -> !torch.none
    %25411 = "torch.constant.none"() : () -> !torch.none
    %25412 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %25413 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %25414 = "torch.aten.arange"(%25409, %25410, %25411, %25412, %25413) : (!torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[131072],si64>
    %25415 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %25416 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %25417 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %25418 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %25419 = "torch.constant.none"() : () -> !torch.none
    %25420 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %25421 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %25422 = "torch.aten.arange.start_step"(%25415, %25416, %25417, %25418, %25419, %25420, %25421) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[64],si64>
    %25423 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %25424 = "torch.prims.convert_element_type"(%25422, %25423) : (!torch.vtensor<[64],si64>, !torch.int) -> !torch.vtensor<[64],f32>
    %25425 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %25426 = "torch.aten.div.Scalar"(%25424, %25425) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %25427 = "torch.constant.float"() <{value = 5.000000e+05 : f64}> : () -> !torch.float
    %25428 = "torch.aten.pow.Scalar"(%25427, %25426) : (!torch.float, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %25429 = "torch.aten.reciprocal"(%25428) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %25430 = "torch.constant.float"() <{value = 1.000000e+00 : f64}> : () -> !torch.float
    %25431 = "torch.aten.mul.Scalar"(%25429, %25430) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
    %25432 = "torch.aten.reciprocal"(%25431) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %25433 = "torch.constant.float"() <{value = 6.2831853071795862 : f64}> : () -> !torch.float
    %25434 = "torch.aten.mul.Scalar"(%25432, %25433) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
    %25435 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
    %25436 = "torch.aten.gt.Scalar"(%25434, %25435) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %25437 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %25438 = "torch.aten.div.Scalar"(%25431, %25437) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %25439 = "torch.aten.where.self"(%25436, %25438, %25431) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %25440 = "torch.aten.reciprocal"(%25434) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %25441 = "torch.constant.int"() <{value = 8192 : i64}> : () -> !torch.int
    %25442 = "torch.aten.mul.Scalar"(%25440, %25441) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %25443 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %25444 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %25445 = "torch.aten.sub.Scalar"(%25442, %25443, %25444) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
    %25446 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %25447 = "torch.aten.div.Scalar"(%25445, %25446) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %25448 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %25449 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %25450 = "torch.aten.rsub.Scalar"(%25447, %25448, %25449) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
    %25451 = "torch.aten.mul.Tensor"(%25450, %25439) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %25452 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %25453 = "torch.aten.div.Scalar"(%25451, %25452) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %25454 = "torch.aten.mul.Tensor"(%25447, %25439) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %25455 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %25456 = "torch.aten.add.Tensor"(%25453, %25454, %25455) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %25457 = "torch.constant.float"() <{value = 2.048000e+03 : f64}> : () -> !torch.float
    %25458 = "torch.aten.lt.Scalar"(%25434, %25457) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %25459 = "torch.aten.bitwise_not"(%25458) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %25460 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
    %25461 = "torch.aten.gt.Scalar"(%25434, %25460) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %25462 = "torch.aten.bitwise_not"(%25461) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %25463 = "torch.aten.mul.Tensor"(%25459, %25462) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %25464 = "torch.aten.where.self"(%25463, %25456, %25439) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %25465 = "torch.prim.ListConstruct"(%25464, %25464) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.list<vtensor>
    %25466 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %25467 = "torch.aten.cat"(%25465, %25466) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[128],f32>
    %25468 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %25469 = "torch.prims.convert_element_type"(%25414, %25468) : (!torch.vtensor<[131072],si64>, !torch.int) -> !torch.vtensor<[131072],f32>
    %25470 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %25471 = "torch.prims.convert_element_type"(%25467, %25470) : (!torch.vtensor<[128],f32>, !torch.int) -> !torch.vtensor<[128],f32>
    %25472 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
    %25473 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %25474 = "torch.prim.ListConstruct"(%25472, %25473) : (!torch.int, !torch.int) -> !torch.list<int>
    %25475 = "torch.aten.view"(%25469, %25474) : (!torch.vtensor<[131072],f32>, !torch.list<int>) -> !torch.vtensor<[131072,1],f32>
    %25476 = "torch.aten.mul.Tensor"(%25475, %25471) : (!torch.vtensor<[131072,1],f32>, !torch.vtensor<[128],f32>) -> !torch.vtensor<[131072,128],f32>
    %25477 = "torch.aten.cos"(%25476) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
    %25478 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %25479 = "torch.prims.convert_element_type"(%25477, %25478) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
    %25480 = "torch.aten.sin"(%25476) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
    %25481 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %25482 = "torch.prims.convert_element_type"(%25480, %25481) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
    %25483 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %25484 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %25485 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %25486 = "torch.aten.slice.Tensor"(%25479, %25483, %25484, %18481, %25485) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%25486, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %25487 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %25488 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %25489 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %25490 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %25491 = "torch.aten.slice.Tensor"(%25486, %25487, %25488, %25489, %25490) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%25491, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %25492 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %25493 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %25494 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %25495 = "torch.aten.slice.Tensor"(%25482, %25492, %25493, %18481, %25494) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%25495, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %25496 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %25497 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %25498 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %25499 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %25500 = "torch.aten.slice.Tensor"(%25495, %25496, %25497, %25498, %25499) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%25500, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %25501 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %25502 = "torch.aten.unsqueeze"(%25491, %25501) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%25502, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %25503 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %25504 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %25505 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %25506 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %25507 = "torch.aten.slice.Tensor"(%25502, %25503, %25504, %25505, %25506) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%25507, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %25508 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %25509 = "torch.aten.unsqueeze"(%25507, %25508) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%25509, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %25510 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %25511 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %25512 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %25513 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %25514 = "torch.aten.slice.Tensor"(%25509, %25510, %25511, %25512, %25513) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%25514, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %25515 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %25516 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %25517 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %25518 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %25519 = "torch.prim.ListConstruct"(%25515, %25516, %25517, %25518) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %25520 = "torch.aten.repeat"(%25514, %25519) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%25520, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
    %25521 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %25522 = "torch.aten.unsqueeze"(%25500, %25521) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%25522, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %25523 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %25524 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %25525 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %25526 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %25527 = "torch.aten.slice.Tensor"(%25522, %25523, %25524, %25525, %25526) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%25527, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %25528 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %25529 = "torch.aten.unsqueeze"(%25527, %25528) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%25529, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %25530 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %25531 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %25532 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %25533 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %25534 = "torch.aten.slice.Tensor"(%25529, %25530, %25531, %25532, %25533) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%25534, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %25535 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %25536 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %25537 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %25538 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %25539 = "torch.prim.ListConstruct"(%25535, %25536, %25537, %25538) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %25540 = "torch.aten.repeat"(%25534, %25539) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%25540, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
    %25541 = "torch.aten.mul.Tensor"(%25253, %25520) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%25541, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %25542 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %25543 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %25544 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %25545 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %25546 = "torch.aten.slice.Tensor"(%25253, %25542, %25543, %25544, %25545) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%25546, %18474) <{shape_expressions = #map19}> : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.int) -> ()
    %25547 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %25548 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %25549 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %25550 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %25551 = "torch.aten.slice.Tensor"(%25253, %25547, %25548, %25549, %25550) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%25551, %18474) <{shape_expressions = #map19}> : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.int) -> ()
    %25552 = "torch.aten.neg"(%25551) : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>) -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%25552, %18474) <{shape_expressions = #map19}> : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.int) -> ()
    %25553 = "torch.prim.ListConstruct"(%25552, %25546) : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>) -> !torch.list<vtensor>
    %25554 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %25555 = "torch.aten.cat"(%25553, %25554) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%25555, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %25556 = "torch.aten.mul.Tensor"(%25555, %25540) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%25556, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %25557 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %25558 = "torch.aten.add.Tensor"(%25541, %25556, %25557) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%25558, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %25559 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %25560 = "torch.aten.mul.Scalar"(%arg69, %25559) : (!torch.vtensor<[4,?],si64>, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%25560, %18474) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %25561 = "torch.constant.int"() <{value = 20 : i64}> : () -> !torch.int
    %25562 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %25563 = "torch.aten.add.Scalar"(%25560, %25561, %25562) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%25563, %18474) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %25564 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %25565 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %25566 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %25567 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %25568 = "torch.prim.ListConstruct"(%25564, %18477, %25565, %25566, %25567) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %25569 = "torch.aten.view"(%25558, %25568) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%25569, %18474) <{shape_expressions = #map22}> : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %25570 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %25571 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %25572 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %25573 = "torch.prim.ListConstruct"(%19011, %25570, %25571, %25572) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %25574 = "torch.aten.view"(%25569, %25573) : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%25574, %18474) <{shape_expressions = #map23}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %25575 = "torch.prim.ListConstruct"(%19011) : (!torch.int) -> !torch.list<int>
    %25576 = "torch.aten.view"(%25563, %25575) : (!torch.vtensor<[4,?],si64>, !torch.list<int>) -> !torch.vtensor<[?],si64>
    "torch.bind_symbolic_shape"(%25576, %18474) <{shape_expressions = #map24}> : (!torch.vtensor<[?],si64>, !torch.int) -> ()
    %25577 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %25578 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %25579 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %25580 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %25581 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %25582 = "torch.prim.ListConstruct"(%18479, %25577, %25578, %25579, %25580, %25581) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %25583 = "torch.aten.view"(%24985, %25582) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%25583, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %25584 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %25585 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %25586 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %25587 = "torch.prim.ListConstruct"(%18993, %25584, %25585, %25586) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %25588 = "torch.aten.view"(%25583, %25587) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%25588, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %25589 = "torch.prim.ListConstruct"(%25576) : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>>
    %25590 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %25591 = "torch.aten.index_put"(%25588, %25589, %25574, %25590) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.bool) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%25591, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %25592 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %25593 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %25594 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %25595 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %25596 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %25597 = "torch.prim.ListConstruct"(%18479, %25592, %25593, %25594, %25595, %25596) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %25598 = "torch.aten.view"(%25591, %25597) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%25598, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %25599 = "torch.constant.int"() <{value = 2097152 : i64}> : () -> !torch.int
    %25600 = "torch.prim.ListConstruct"(%18479, %25599) : (!torch.int, !torch.int) -> !torch.list<int>
    %25601 = "torch.aten.view"(%25598, %25600) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,2097152],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%25601, %18475) <{shape_expressions = #map2}> : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> ()
    %25602 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %25603 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %25604 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %25605 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %25606 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %25607 = "torch.prim.ListConstruct"(%18479, %25602, %25603, %25604, %25605, %25606) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %25608 = "torch.aten.view"(%25601, %25607) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%25608, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %25609 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %25610 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %25611 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %25612 = "torch.prim.ListConstruct"(%18993, %25609, %25610, %25611) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %25613 = "torch.aten.view"(%25608, %25612) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%25613, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %25614 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %25615 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %25616 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %25617 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %25618 = "torch.prim.ListConstruct"(%25614, %18477, %25615, %25616, %25617) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %25619 = "torch.aten.view"(%25258, %25618) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%25619, %18474) <{shape_expressions = #map22}> : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %25620 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %25621 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %25622 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %25623 = "torch.prim.ListConstruct"(%19011, %25620, %25621, %25622) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %25624 = "torch.aten.view"(%25619, %25623) : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%25624, %18474) <{shape_expressions = #map23}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %25625 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %25626 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %25627 = "torch.aten.add.Scalar"(%25563, %25625, %25626) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%25627, %18474) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %25628 = "torch.prim.ListConstruct"(%19011) : (!torch.int) -> !torch.list<int>
    %25629 = "torch.aten.view"(%25627, %25628) : (!torch.vtensor<[4,?],si64>, !torch.list<int>) -> !torch.vtensor<[?],si64>
    "torch.bind_symbolic_shape"(%25629, %18474) <{shape_expressions = #map24}> : (!torch.vtensor<[?],si64>, !torch.int) -> ()
    %25630 = "torch.prim.ListConstruct"(%25629) : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>>
    %25631 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %25632 = "torch.aten.index_put"(%25613, %25630, %25624, %25631) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.bool) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%25632, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %25633 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %25634 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %25635 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %25636 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %25637 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %25638 = "torch.prim.ListConstruct"(%18479, %25633, %25634, %25635, %25636, %25637) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %25639 = "torch.aten.view"(%25632, %25638) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%25639, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %25640 = "torch.constant.int"() <{value = 2097152 : i64}> : () -> !torch.int
    %25641 = "torch.prim.ListConstruct"(%18479, %25640) : (!torch.int, !torch.int) -> !torch.list<int>
    %25642 = "torch.aten.view"(%25639, %25641) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,2097152],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%25642, %18475) <{shape_expressions = #map2}> : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> ()
    %25643 = "torch.constant.int"() <{value = -2 : i64}> : () -> !torch.int
    %25644 = "torch.aten.unsqueeze"(%25558, %25643) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%25644, %18474) <{shape_expressions = #map25}> : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.int) -> ()
    %25645 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %25646 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %25647 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %25648 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %25649 = "torch.prim.ListConstruct"(%25645, %18481, %25646, %25647, %25648) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %25650 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %25651 = "torch.aten.expand"(%25644, %25649, %25650) : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%25651, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %25652 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %25653 = "torch.aten.clone"(%25651, %25652) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%25653, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %25654 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %25655 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %25656 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %25657 = "torch.prim.ListConstruct"(%25654, %18481, %25655, %25656) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %25658 = "torch.aten._unsafe_view"(%25653, %25657) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%25658, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %25659 = "torch.constant.int"() <{value = -2 : i64}> : () -> !torch.int
    %25660 = "torch.aten.unsqueeze"(%25258, %25659) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%25660, %18474) <{shape_expressions = #map25}> : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.int) -> ()
    %25661 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %25662 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %25663 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %25664 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %25665 = "torch.prim.ListConstruct"(%25661, %18481, %25662, %25663, %25664) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %25666 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %25667 = "torch.aten.expand"(%25660, %25665, %25666) : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%25667, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %25668 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %25669 = "torch.aten.clone"(%25667, %25668) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%25669, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %25670 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %25671 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %25672 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %25673 = "torch.prim.ListConstruct"(%25670, %18481, %25671, %25672) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %25674 = "torch.aten._unsafe_view"(%25669, %25673) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%25674, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %25675 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %25676 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %25677 = "torch.aten.transpose.int"(%25408, %25675, %25676) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%25677, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %25678 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %25679 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %25680 = "torch.aten.transpose.int"(%25658, %25678, %25679) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%25680, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %25681 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %25682 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %25683 = "torch.aten.transpose.int"(%25674, %25681, %25682) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%25683, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %25684 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %25685 = "torch.aten.squeeze.dim"(%18570, %25684) : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,1,?,?],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%25685, %18474) <{shape_expressions = #map8}> : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> ()
    %25686 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %25687 = "torch.aten.squeeze.dim"(%25685, %25686) : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,1,?,?],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%25687, %18474) <{shape_expressions = #map8}> : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> ()
    %25688 = "torch_c.to_builtin_tensor"(%25677) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %25689 = "torch_c.to_builtin_tensor"(%25680) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %25690 = "torch_c.to_builtin_tensor"(%25683) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %25691 = "torch_c.to_builtin_tensor"(%25687) : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>) -> tensor<4x1x?x?xf8E4M3FNUZ>
    %25692 = "tensor.cast"(%25691) : (tensor<4x1x?x?xf8E4M3FNUZ>) -> tensor<?x?x?x?xf8E4M3FNUZ>
    %25693 = "torch_c.to_builtin_tensor"(%17609) : (!torch.vtensor<[],f32>) -> tensor<f32>
    %25694 = "util.call"(%25688, %25689, %25690, %25693, %25692) <{callee = @sharktank_masked_flash_attention_4_32_128_128_f8E4M3FNUZ_f32_f32}> : (tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<f32>, tensor<?x?x?x?xf8E4M3FNUZ>) -> tensor<4x32x?x128xf32>
    %25695 = "torch_c.from_builtin_tensor"(%25694) : (tensor<4x32x?x128xf32>) -> !torch.vtensor<[4,32,?,128],f32>
    "torch.bind_symbolic_shape"(%25695, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f32>, !torch.int) -> ()
    %25696 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %25697 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %25698 = "torch.aten.transpose.int"(%25695, %25696, %25697) : (!torch.vtensor<[4,32,?,128],f32>, !torch.int, !torch.int) -> !torch.vtensor<[4,?,32,128],f32>
    "torch.bind_symbolic_shape"(%25698, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f32>, !torch.int) -> ()
    %25699 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %25700 = "torch.aten.clone"(%25698, %25699) : (!torch.vtensor<[4,?,32,128],f32>, !torch.int) -> !torch.vtensor<[4,?,32,128],f32>
    "torch.bind_symbolic_shape"(%25700, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f32>, !torch.int) -> ()
    %25701 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %25702 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %25703 = "torch.prim.ListConstruct"(%25701, %18481, %25702) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %25704 = "torch.aten._unsafe_view"(%25700, %25703) : (!torch.vtensor<[4,?,32,128],f32>, !torch.list<int>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%25704, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %25705 = "torch.aten.div.Tensor"(%25704, %17611) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%25705, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %25706 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %25707 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %25708 = "torch.aten.clamp"(%25705, %25706, %25707) : (!torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%25708, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %25709 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %25710 = "torch.prims.convert_element_type"(%25708, %25709) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%25710, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %25711 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %25712 = "torch.aten.unsqueeze"(%17613, %25711) : (!torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>
    %25713 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %25714 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %25715 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %25716 = "torch.prim.ListConstruct"(%25713, %25714, %25715) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %25717 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %25718 = "torch.aten.expand"(%25712, %25716, %25717) : (!torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ>
    %25719 = "torch_c.to_builtin_tensor"(%25710) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %25720 = "torch_c.to_builtin_tensor"(%25718) : (!torch.vtensor<[4,4096,4096],f8E4M3FNUZ>) -> tensor<4x4096x4096xf8E4M3FNUZ>
    %25721 = "util.call"(%25719, %25720) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x?x4096xf32>
    %25722 = "torch_c.from_builtin_tensor"(%25721) : (tensor<4x?x4096xf32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%25722, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %25723 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %25724 = "torch.prims.convert_element_type"(%25722, %25723) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%25724, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %25725 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %25726 = "torch.aten.add.Tensor"(%25152, %25724, %25725) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%25726, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %25727 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %25728 = "torch.prims.convert_element_type"(%25726, %25727) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%25728, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %25729 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %25730 = "torch.aten.pow.Tensor_Scalar"(%25728, %25729) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%25730, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %25731 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %25732 = "torch.prim.ListConstruct"(%25731) : (!torch.int) -> !torch.list<int>
    %25733 = "torch.constant.bool"() <{value = true}> : () -> !torch.bool
    %25734 = "torch.constant.none"() : () -> !torch.none
    %25735 = "torch.aten.mean.dim"(%25730, %25732, %25733, %25734) : (!torch.vtensor<[4,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%25735, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %25736 = "torch.constant.float"() <{value = 1.000000e-05 : f64}> : () -> !torch.float
    %25737 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %25738 = "torch.aten.add.Scalar"(%25735, %25736, %25737) : (!torch.vtensor<[4,?,1],f32>, !torch.float, !torch.int) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%25738, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %25739 = "torch.aten.rsqrt"(%25738) : (!torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%25739, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %25740 = "torch.aten.mul.Tensor"(%25728, %25739) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%25740, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %25741 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %25742 = "torch.prims.convert_element_type"(%25740, %25741) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%25742, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %25743 = "torch.aten.mul.Tensor"(%17615, %25742) : (!torch.vtensor<[4096],bf16>, !torch.vtensor<[4,?,4096],bf16>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%25743, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %25744 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %25745 = "torch.prims.convert_element_type"(%25743, %25744) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%25745, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %25746 = "torch.aten.div.Tensor"(%25745, %17617) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%25746, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %25747 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %25748 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %25749 = "torch.aten.clamp"(%25746, %25747, %25748) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%25749, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %25750 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %25751 = "torch.prims.convert_element_type"(%25749, %25750) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%25751, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %25752 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %25753 = "torch.aten.unsqueeze"(%17619, %25752) : (!torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>
    %25754 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %25755 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %25756 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %25757 = "torch.prim.ListConstruct"(%25754, %25755, %25756) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %25758 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %25759 = "torch.aten.expand"(%25753, %25757, %25758) : (!torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ>
    %25760 = "torch_c.to_builtin_tensor"(%25751) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %25761 = "torch_c.to_builtin_tensor"(%25759) : (!torch.vtensor<[4,14336,4096],f8E4M3FNUZ>) -> tensor<4x14336x4096xf8E4M3FNUZ>
    %25762 = "util.call"(%25760, %25761) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x?x14336xf32>
    %25763 = "torch_c.from_builtin_tensor"(%25762) : (tensor<4x?x14336xf32>) -> !torch.vtensor<[4,?,14336],f32>
    "torch.bind_symbolic_shape"(%25763, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> ()
    %25764 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %25765 = "torch.prims.convert_element_type"(%25763, %25764) : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%25765, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %25766 = "torch.aten.silu"(%25765) : (!torch.vtensor<[4,?,14336],bf16>) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%25766, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %25767 = "torch.aten.div.Tensor"(%25745, %17621) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%25767, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %25768 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %25769 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %25770 = "torch.aten.clamp"(%25767, %25768, %25769) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%25770, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %25771 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %25772 = "torch.prims.convert_element_type"(%25770, %25771) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%25772, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %25773 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %25774 = "torch.aten.unsqueeze"(%17623, %25773) : (!torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>
    %25775 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %25776 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %25777 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %25778 = "torch.prim.ListConstruct"(%25775, %25776, %25777) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %25779 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %25780 = "torch.aten.expand"(%25774, %25778, %25779) : (!torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ>
    %25781 = "torch_c.to_builtin_tensor"(%25772) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %25782 = "torch_c.to_builtin_tensor"(%25780) : (!torch.vtensor<[4,14336,4096],f8E4M3FNUZ>) -> tensor<4x14336x4096xf8E4M3FNUZ>
    %25783 = "util.call"(%25781, %25782) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x?x14336xf32>
    %25784 = "torch_c.from_builtin_tensor"(%25783) : (tensor<4x?x14336xf32>) -> !torch.vtensor<[4,?,14336],f32>
    "torch.bind_symbolic_shape"(%25784, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> ()
    %25785 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %25786 = "torch.prims.convert_element_type"(%25784, %25785) : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%25786, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %25787 = "torch.aten.mul.Tensor"(%25766, %25786) : (!torch.vtensor<[4,?,14336],bf16>, !torch.vtensor<[4,?,14336],bf16>) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%25787, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %25788 = "torch.aten.div.Tensor"(%25787, %17625) : (!torch.vtensor<[4,?,14336],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%25788, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %25789 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %25790 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %25791 = "torch.aten.clamp"(%25788, %25789, %25790) : (!torch.vtensor<[4,?,14336],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%25791, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %25792 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %25793 = "torch.prims.convert_element_type"(%25791, %25792) : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> !torch.vtensor<[4,?,14336],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%25793, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],f8E4M3FNUZ>, !torch.int) -> ()
    %25794 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %25795 = "torch.aten.unsqueeze"(%17627, %25794) : (!torch.vtensor<[4096,14336],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,14336],f8E4M3FNUZ>
    %25796 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %25797 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %25798 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %25799 = "torch.prim.ListConstruct"(%25796, %25797, %25798) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %25800 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %25801 = "torch.aten.expand"(%25795, %25799, %25800) : (!torch.vtensor<[1,4096,14336],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,14336],f8E4M3FNUZ>
    %25802 = "torch_c.to_builtin_tensor"(%25793) : (!torch.vtensor<[4,?,14336],f8E4M3FNUZ>) -> tensor<4x?x14336xf8E4M3FNUZ>
    %25803 = "torch_c.to_builtin_tensor"(%25801) : (!torch.vtensor<[4,4096,14336],f8E4M3FNUZ>) -> tensor<4x4096x14336xf8E4M3FNUZ>
    %25804 = "util.call"(%25802, %25803) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx14336xf8E4M3FNUZ_R4x4096x14336xf8E4M3FNUZ}> : (tensor<4x?x14336xf8E4M3FNUZ>, tensor<4x4096x14336xf8E4M3FNUZ>) -> tensor<4x?x4096xf32>
    %25805 = "torch_c.from_builtin_tensor"(%25804) : (tensor<4x?x4096xf32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%25805, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %25806 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %25807 = "torch.prims.convert_element_type"(%25805, %25806) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%25807, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %25808 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %25809 = "torch.aten.add.Tensor"(%25726, %25807, %25808) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%25809, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %25810 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %25811 = "torch.prims.convert_element_type"(%25809, %25810) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%25811, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %25812 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %25813 = "torch.aten.pow.Tensor_Scalar"(%25811, %25812) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%25813, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %25814 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %25815 = "torch.prim.ListConstruct"(%25814) : (!torch.int) -> !torch.list<int>
    %25816 = "torch.constant.bool"() <{value = true}> : () -> !torch.bool
    %25817 = "torch.constant.none"() : () -> !torch.none
    %25818 = "torch.aten.mean.dim"(%25813, %25815, %25816, %25817) : (!torch.vtensor<[4,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%25818, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %25819 = "torch.constant.float"() <{value = 1.000000e-05 : f64}> : () -> !torch.float
    %25820 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %25821 = "torch.aten.add.Scalar"(%25818, %25819, %25820) : (!torch.vtensor<[4,?,1],f32>, !torch.float, !torch.int) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%25821, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %25822 = "torch.aten.rsqrt"(%25821) : (!torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%25822, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %25823 = "torch.aten.mul.Tensor"(%25811, %25822) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%25823, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %25824 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %25825 = "torch.prims.convert_element_type"(%25823, %25824) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%25825, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %25826 = "torch.aten.mul.Tensor"(%17629, %25825) : (!torch.vtensor<[4096],bf16>, !torch.vtensor<[4,?,4096],bf16>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%25826, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %25827 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %25828 = "torch.prims.convert_element_type"(%25826, %25827) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%25828, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %25829 = "torch.aten.div.Tensor"(%25828, %17631) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%25829, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %25830 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %25831 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %25832 = "torch.aten.clamp"(%25829, %25830, %25831) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%25832, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %25833 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %25834 = "torch.prims.convert_element_type"(%25832, %25833) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%25834, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %25835 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %25836 = "torch.aten.unsqueeze"(%17633, %25835) : (!torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>
    %25837 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %25838 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %25839 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %25840 = "torch.prim.ListConstruct"(%25837, %25838, %25839) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %25841 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %25842 = "torch.aten.expand"(%25836, %25840, %25841) : (!torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ>
    %25843 = "torch_c.to_builtin_tensor"(%25834) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %25844 = "torch_c.to_builtin_tensor"(%25842) : (!torch.vtensor<[4,4096,4096],f8E4M3FNUZ>) -> tensor<4x4096x4096xf8E4M3FNUZ>
    %25845 = "util.call"(%25843, %25844) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x?x4096xf32>
    %25846 = "torch_c.from_builtin_tensor"(%25845) : (tensor<4x?x4096xf32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%25846, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %25847 = "torch.aten.div.Tensor"(%25846, %17635) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%25847, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %25848 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %25849 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %25850 = "torch.aten.clamp"(%25847, %25848, %25849) : (!torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%25850, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %25851 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %25852 = "torch.prims.convert_element_type"(%25850, %25851) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%25852, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %25853 = "torch.aten.div.Tensor"(%25828, %17637) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%25853, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %25854 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %25855 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %25856 = "torch.aten.clamp"(%25853, %25854, %25855) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%25856, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %25857 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %25858 = "torch.prims.convert_element_type"(%25856, %25857) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%25858, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %25859 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %25860 = "torch.aten.unsqueeze"(%17639, %25859) : (!torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>
    %25861 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %25862 = "torch.constant.int"() <{value = 1024 : i64}> : () -> !torch.int
    %25863 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %25864 = "torch.prim.ListConstruct"(%25861, %25862, %25863) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %25865 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %25866 = "torch.aten.expand"(%25860, %25864, %25865) : (!torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ>
    %25867 = "torch_c.to_builtin_tensor"(%25858) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %25868 = "torch_c.to_builtin_tensor"(%25866) : (!torch.vtensor<[4,1024,4096],f8E4M3FNUZ>) -> tensor<4x1024x4096xf8E4M3FNUZ>
    %25869 = "util.call"(%25867, %25868) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x?x1024xf32>
    %25870 = "torch_c.from_builtin_tensor"(%25869) : (tensor<4x?x1024xf32>) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%25870, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %25871 = "torch.aten.div.Tensor"(%25870, %17641) : (!torch.vtensor<[4,?,1024],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%25871, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %25872 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %25873 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %25874 = "torch.aten.clamp"(%25871, %25872, %25873) : (!torch.vtensor<[4,?,1024],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%25874, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %25875 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %25876 = "torch.prims.convert_element_type"(%25874, %25875) : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> !torch.vtensor<[4,?,1024],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%25876, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.int) -> ()
    %25877 = "torch.aten.div.Tensor"(%25828, %17643) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%25877, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %25878 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %25879 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %25880 = "torch.aten.clamp"(%25877, %25878, %25879) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%25880, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %25881 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %25882 = "torch.prims.convert_element_type"(%25880, %25881) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%25882, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %25883 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %25884 = "torch.aten.unsqueeze"(%17645, %25883) : (!torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>
    %25885 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %25886 = "torch.constant.int"() <{value = 1024 : i64}> : () -> !torch.int
    %25887 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %25888 = "torch.prim.ListConstruct"(%25885, %25886, %25887) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %25889 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %25890 = "torch.aten.expand"(%25884, %25888, %25889) : (!torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ>
    %25891 = "torch_c.to_builtin_tensor"(%25882) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %25892 = "torch_c.to_builtin_tensor"(%25890) : (!torch.vtensor<[4,1024,4096],f8E4M3FNUZ>) -> tensor<4x1024x4096xf8E4M3FNUZ>
    %25893 = "util.call"(%25891, %25892) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x?x1024xf32>
    %25894 = "torch_c.from_builtin_tensor"(%25893) : (tensor<4x?x1024xf32>) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%25894, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %25895 = "torch.aten.div.Tensor"(%25894, %17647) : (!torch.vtensor<[4,?,1024],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%25895, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %25896 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %25897 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %25898 = "torch.aten.clamp"(%25895, %25896, %25897) : (!torch.vtensor<[4,?,1024],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%25898, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %25899 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %25900 = "torch.prims.convert_element_type"(%25898, %25899) : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> !torch.vtensor<[4,?,1024],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%25900, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.int) -> ()
    %25901 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %25902 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %25903 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %25904 = "torch.prim.ListConstruct"(%25901, %18481, %25902, %25903) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %25905 = "torch.aten.view"(%25852, %25904) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%25905, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %25906 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %25907 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %25908 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %25909 = "torch.prim.ListConstruct"(%25906, %18481, %25907, %25908) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %25910 = "torch.aten.view"(%25876, %25909) : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%25910, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %25911 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %25912 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %25913 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %25914 = "torch.prim.ListConstruct"(%25911, %18481, %25912, %25913) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %25915 = "torch.aten.view"(%25900, %25914) : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%25915, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %25916 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
    %25917 = "torch.constant.none"() : () -> !torch.none
    %25918 = "torch.constant.none"() : () -> !torch.none
    %25919 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %25920 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %25921 = "torch.aten.arange"(%25916, %25917, %25918, %25919, %25920) : (!torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[131072],si64>
    %25922 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %25923 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %25924 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %25925 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %25926 = "torch.constant.none"() : () -> !torch.none
    %25927 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %25928 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %25929 = "torch.aten.arange.start_step"(%25922, %25923, %25924, %25925, %25926, %25927, %25928) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[64],si64>
    %25930 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %25931 = "torch.prims.convert_element_type"(%25929, %25930) : (!torch.vtensor<[64],si64>, !torch.int) -> !torch.vtensor<[64],f32>
    %25932 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %25933 = "torch.aten.div.Scalar"(%25931, %25932) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %25934 = "torch.constant.float"() <{value = 5.000000e+05 : f64}> : () -> !torch.float
    %25935 = "torch.aten.pow.Scalar"(%25934, %25933) : (!torch.float, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %25936 = "torch.aten.reciprocal"(%25935) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %25937 = "torch.constant.float"() <{value = 1.000000e+00 : f64}> : () -> !torch.float
    %25938 = "torch.aten.mul.Scalar"(%25936, %25937) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
    %25939 = "torch.aten.reciprocal"(%25938) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %25940 = "torch.constant.float"() <{value = 6.2831853071795862 : f64}> : () -> !torch.float
    %25941 = "torch.aten.mul.Scalar"(%25939, %25940) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
    %25942 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
    %25943 = "torch.aten.gt.Scalar"(%25941, %25942) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %25944 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %25945 = "torch.aten.div.Scalar"(%25938, %25944) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %25946 = "torch.aten.where.self"(%25943, %25945, %25938) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %25947 = "torch.aten.reciprocal"(%25941) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %25948 = "torch.constant.int"() <{value = 8192 : i64}> : () -> !torch.int
    %25949 = "torch.aten.mul.Scalar"(%25947, %25948) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %25950 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %25951 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %25952 = "torch.aten.sub.Scalar"(%25949, %25950, %25951) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
    %25953 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %25954 = "torch.aten.div.Scalar"(%25952, %25953) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %25955 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %25956 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %25957 = "torch.aten.rsub.Scalar"(%25954, %25955, %25956) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
    %25958 = "torch.aten.mul.Tensor"(%25957, %25946) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %25959 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %25960 = "torch.aten.div.Scalar"(%25958, %25959) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %25961 = "torch.aten.mul.Tensor"(%25954, %25946) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %25962 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %25963 = "torch.aten.add.Tensor"(%25960, %25961, %25962) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %25964 = "torch.constant.float"() <{value = 2.048000e+03 : f64}> : () -> !torch.float
    %25965 = "torch.aten.lt.Scalar"(%25941, %25964) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %25966 = "torch.aten.bitwise_not"(%25965) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %25967 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
    %25968 = "torch.aten.gt.Scalar"(%25941, %25967) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %25969 = "torch.aten.bitwise_not"(%25968) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %25970 = "torch.aten.mul.Tensor"(%25966, %25969) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %25971 = "torch.aten.where.self"(%25970, %25963, %25946) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %25972 = "torch.prim.ListConstruct"(%25971, %25971) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.list<vtensor>
    %25973 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %25974 = "torch.aten.cat"(%25972, %25973) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[128],f32>
    %25975 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %25976 = "torch.prims.convert_element_type"(%25921, %25975) : (!torch.vtensor<[131072],si64>, !torch.int) -> !torch.vtensor<[131072],f32>
    %25977 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %25978 = "torch.prims.convert_element_type"(%25974, %25977) : (!torch.vtensor<[128],f32>, !torch.int) -> !torch.vtensor<[128],f32>
    %25979 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
    %25980 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %25981 = "torch.prim.ListConstruct"(%25979, %25980) : (!torch.int, !torch.int) -> !torch.list<int>
    %25982 = "torch.aten.view"(%25976, %25981) : (!torch.vtensor<[131072],f32>, !torch.list<int>) -> !torch.vtensor<[131072,1],f32>
    %25983 = "torch.aten.mul.Tensor"(%25982, %25978) : (!torch.vtensor<[131072,1],f32>, !torch.vtensor<[128],f32>) -> !torch.vtensor<[131072,128],f32>
    %25984 = "torch.aten.cos"(%25983) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
    %25985 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %25986 = "torch.prims.convert_element_type"(%25984, %25985) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
    %25987 = "torch.aten.sin"(%25983) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
    %25988 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %25989 = "torch.prims.convert_element_type"(%25987, %25988) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
    %25990 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %25991 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %25992 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %25993 = "torch.aten.slice.Tensor"(%25986, %25990, %25991, %18481, %25992) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%25993, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %25994 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %25995 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %25996 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %25997 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %25998 = "torch.aten.slice.Tensor"(%25993, %25994, %25995, %25996, %25997) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%25998, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %25999 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %26000 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %26001 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %26002 = "torch.aten.slice.Tensor"(%25989, %25999, %26000, %18481, %26001) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%26002, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %26003 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %26004 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %26005 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %26006 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %26007 = "torch.aten.slice.Tensor"(%26002, %26003, %26004, %26005, %26006) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%26007, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %26008 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %26009 = "torch.aten.unsqueeze"(%25998, %26008) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%26009, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %26010 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %26011 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %26012 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %26013 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %26014 = "torch.aten.slice.Tensor"(%26009, %26010, %26011, %26012, %26013) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%26014, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %26015 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %26016 = "torch.aten.unsqueeze"(%26014, %26015) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%26016, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %26017 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %26018 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %26019 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %26020 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %26021 = "torch.aten.slice.Tensor"(%26016, %26017, %26018, %26019, %26020) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%26021, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %26022 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %26023 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %26024 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %26025 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %26026 = "torch.prim.ListConstruct"(%26022, %26023, %26024, %26025) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %26027 = "torch.aten.repeat"(%26021, %26026) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%26027, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
    %26028 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %26029 = "torch.aten.unsqueeze"(%26007, %26028) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%26029, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %26030 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %26031 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %26032 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %26033 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %26034 = "torch.aten.slice.Tensor"(%26029, %26030, %26031, %26032, %26033) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%26034, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %26035 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %26036 = "torch.aten.unsqueeze"(%26034, %26035) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%26036, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %26037 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %26038 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %26039 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %26040 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %26041 = "torch.aten.slice.Tensor"(%26036, %26037, %26038, %26039, %26040) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%26041, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %26042 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %26043 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %26044 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %26045 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %26046 = "torch.prim.ListConstruct"(%26042, %26043, %26044, %26045) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %26047 = "torch.aten.repeat"(%26041, %26046) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%26047, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
    %26048 = "torch.aten.mul.Tensor"(%25905, %26027) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%26048, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %26049 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %26050 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %26051 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %26052 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %26053 = "torch.aten.slice.Tensor"(%25905, %26049, %26050, %26051, %26052) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%26053, %18474) <{shape_expressions = #map18}> : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.int) -> ()
    %26054 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %26055 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %26056 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %26057 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %26058 = "torch.aten.slice.Tensor"(%25905, %26054, %26055, %26056, %26057) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%26058, %18474) <{shape_expressions = #map18}> : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.int) -> ()
    %26059 = "torch.aten.neg"(%26058) : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>) -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%26059, %18474) <{shape_expressions = #map18}> : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.int) -> ()
    %26060 = "torch.prim.ListConstruct"(%26059, %26053) : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>) -> !torch.list<vtensor>
    %26061 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %26062 = "torch.aten.cat"(%26060, %26061) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%26062, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %26063 = "torch.aten.mul.Tensor"(%26062, %26047) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%26063, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %26064 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %26065 = "torch.aten.add.Tensor"(%26048, %26063, %26064) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%26065, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %26066 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
    %26067 = "torch.constant.none"() : () -> !torch.none
    %26068 = "torch.constant.none"() : () -> !torch.none
    %26069 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %26070 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %26071 = "torch.aten.arange"(%26066, %26067, %26068, %26069, %26070) : (!torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[131072],si64>
    %26072 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %26073 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %26074 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %26075 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %26076 = "torch.constant.none"() : () -> !torch.none
    %26077 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %26078 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %26079 = "torch.aten.arange.start_step"(%26072, %26073, %26074, %26075, %26076, %26077, %26078) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[64],si64>
    %26080 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %26081 = "torch.prims.convert_element_type"(%26079, %26080) : (!torch.vtensor<[64],si64>, !torch.int) -> !torch.vtensor<[64],f32>
    %26082 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %26083 = "torch.aten.div.Scalar"(%26081, %26082) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %26084 = "torch.constant.float"() <{value = 5.000000e+05 : f64}> : () -> !torch.float
    %26085 = "torch.aten.pow.Scalar"(%26084, %26083) : (!torch.float, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %26086 = "torch.aten.reciprocal"(%26085) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %26087 = "torch.constant.float"() <{value = 1.000000e+00 : f64}> : () -> !torch.float
    %26088 = "torch.aten.mul.Scalar"(%26086, %26087) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
    %26089 = "torch.aten.reciprocal"(%26088) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %26090 = "torch.constant.float"() <{value = 6.2831853071795862 : f64}> : () -> !torch.float
    %26091 = "torch.aten.mul.Scalar"(%26089, %26090) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
    %26092 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
    %26093 = "torch.aten.gt.Scalar"(%26091, %26092) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %26094 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %26095 = "torch.aten.div.Scalar"(%26088, %26094) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %26096 = "torch.aten.where.self"(%26093, %26095, %26088) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %26097 = "torch.aten.reciprocal"(%26091) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %26098 = "torch.constant.int"() <{value = 8192 : i64}> : () -> !torch.int
    %26099 = "torch.aten.mul.Scalar"(%26097, %26098) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %26100 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %26101 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %26102 = "torch.aten.sub.Scalar"(%26099, %26100, %26101) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
    %26103 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %26104 = "torch.aten.div.Scalar"(%26102, %26103) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %26105 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %26106 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %26107 = "torch.aten.rsub.Scalar"(%26104, %26105, %26106) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
    %26108 = "torch.aten.mul.Tensor"(%26107, %26096) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %26109 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %26110 = "torch.aten.div.Scalar"(%26108, %26109) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %26111 = "torch.aten.mul.Tensor"(%26104, %26096) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %26112 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %26113 = "torch.aten.add.Tensor"(%26110, %26111, %26112) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %26114 = "torch.constant.float"() <{value = 2.048000e+03 : f64}> : () -> !torch.float
    %26115 = "torch.aten.lt.Scalar"(%26091, %26114) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %26116 = "torch.aten.bitwise_not"(%26115) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %26117 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
    %26118 = "torch.aten.gt.Scalar"(%26091, %26117) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %26119 = "torch.aten.bitwise_not"(%26118) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %26120 = "torch.aten.mul.Tensor"(%26116, %26119) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %26121 = "torch.aten.where.self"(%26120, %26113, %26096) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %26122 = "torch.prim.ListConstruct"(%26121, %26121) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.list<vtensor>
    %26123 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %26124 = "torch.aten.cat"(%26122, %26123) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[128],f32>
    %26125 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %26126 = "torch.prims.convert_element_type"(%26071, %26125) : (!torch.vtensor<[131072],si64>, !torch.int) -> !torch.vtensor<[131072],f32>
    %26127 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %26128 = "torch.prims.convert_element_type"(%26124, %26127) : (!torch.vtensor<[128],f32>, !torch.int) -> !torch.vtensor<[128],f32>
    %26129 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
    %26130 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %26131 = "torch.prim.ListConstruct"(%26129, %26130) : (!torch.int, !torch.int) -> !torch.list<int>
    %26132 = "torch.aten.view"(%26126, %26131) : (!torch.vtensor<[131072],f32>, !torch.list<int>) -> !torch.vtensor<[131072,1],f32>
    %26133 = "torch.aten.mul.Tensor"(%26132, %26128) : (!torch.vtensor<[131072,1],f32>, !torch.vtensor<[128],f32>) -> !torch.vtensor<[131072,128],f32>
    %26134 = "torch.aten.cos"(%26133) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
    %26135 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %26136 = "torch.prims.convert_element_type"(%26134, %26135) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
    %26137 = "torch.aten.sin"(%26133) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
    %26138 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %26139 = "torch.prims.convert_element_type"(%26137, %26138) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
    %26140 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %26141 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %26142 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %26143 = "torch.aten.slice.Tensor"(%26136, %26140, %26141, %18481, %26142) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%26143, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %26144 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %26145 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %26146 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %26147 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %26148 = "torch.aten.slice.Tensor"(%26143, %26144, %26145, %26146, %26147) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%26148, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %26149 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %26150 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %26151 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %26152 = "torch.aten.slice.Tensor"(%26139, %26149, %26150, %18481, %26151) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%26152, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %26153 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %26154 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %26155 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %26156 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %26157 = "torch.aten.slice.Tensor"(%26152, %26153, %26154, %26155, %26156) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%26157, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %26158 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %26159 = "torch.aten.unsqueeze"(%26148, %26158) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%26159, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %26160 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %26161 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %26162 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %26163 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %26164 = "torch.aten.slice.Tensor"(%26159, %26160, %26161, %26162, %26163) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%26164, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %26165 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %26166 = "torch.aten.unsqueeze"(%26164, %26165) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%26166, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %26167 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %26168 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %26169 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %26170 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %26171 = "torch.aten.slice.Tensor"(%26166, %26167, %26168, %26169, %26170) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%26171, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %26172 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %26173 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %26174 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %26175 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %26176 = "torch.prim.ListConstruct"(%26172, %26173, %26174, %26175) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %26177 = "torch.aten.repeat"(%26171, %26176) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%26177, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
    %26178 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %26179 = "torch.aten.unsqueeze"(%26157, %26178) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%26179, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %26180 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %26181 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %26182 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %26183 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %26184 = "torch.aten.slice.Tensor"(%26179, %26180, %26181, %26182, %26183) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%26184, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %26185 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %26186 = "torch.aten.unsqueeze"(%26184, %26185) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%26186, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %26187 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %26188 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %26189 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %26190 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %26191 = "torch.aten.slice.Tensor"(%26186, %26187, %26188, %26189, %26190) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%26191, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %26192 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %26193 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %26194 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %26195 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %26196 = "torch.prim.ListConstruct"(%26192, %26193, %26194, %26195) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %26197 = "torch.aten.repeat"(%26191, %26196) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%26197, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
    %26198 = "torch.aten.mul.Tensor"(%25910, %26177) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%26198, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %26199 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %26200 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %26201 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %26202 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %26203 = "torch.aten.slice.Tensor"(%25910, %26199, %26200, %26201, %26202) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%26203, %18474) <{shape_expressions = #map19}> : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.int) -> ()
    %26204 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %26205 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %26206 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %26207 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %26208 = "torch.aten.slice.Tensor"(%25910, %26204, %26205, %26206, %26207) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%26208, %18474) <{shape_expressions = #map19}> : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.int) -> ()
    %26209 = "torch.aten.neg"(%26208) : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>) -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%26209, %18474) <{shape_expressions = #map19}> : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.int) -> ()
    %26210 = "torch.prim.ListConstruct"(%26209, %26203) : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>) -> !torch.list<vtensor>
    %26211 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %26212 = "torch.aten.cat"(%26210, %26211) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%26212, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %26213 = "torch.aten.mul.Tensor"(%26212, %26197) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%26213, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %26214 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %26215 = "torch.aten.add.Tensor"(%26198, %26213, %26214) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%26215, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %26216 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %26217 = "torch.aten.mul.Scalar"(%arg69, %26216) : (!torch.vtensor<[4,?],si64>, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%26217, %18474) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %26218 = "torch.constant.int"() <{value = 22 : i64}> : () -> !torch.int
    %26219 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %26220 = "torch.aten.add.Scalar"(%26217, %26218, %26219) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%26220, %18474) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %26221 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %26222 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %26223 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %26224 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %26225 = "torch.prim.ListConstruct"(%26221, %18477, %26222, %26223, %26224) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %26226 = "torch.aten.view"(%26215, %26225) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%26226, %18474) <{shape_expressions = #map22}> : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %26227 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %26228 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %26229 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %26230 = "torch.prim.ListConstruct"(%19011, %26227, %26228, %26229) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %26231 = "torch.aten.view"(%26226, %26230) : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%26231, %18474) <{shape_expressions = #map23}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %26232 = "torch.prim.ListConstruct"(%19011) : (!torch.int) -> !torch.list<int>
    %26233 = "torch.aten.view"(%26220, %26232) : (!torch.vtensor<[4,?],si64>, !torch.list<int>) -> !torch.vtensor<[?],si64>
    "torch.bind_symbolic_shape"(%26233, %18474) <{shape_expressions = #map24}> : (!torch.vtensor<[?],si64>, !torch.int) -> ()
    %26234 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %26235 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %26236 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %26237 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %26238 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %26239 = "torch.prim.ListConstruct"(%18479, %26234, %26235, %26236, %26237, %26238) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %26240 = "torch.aten.view"(%25642, %26239) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%26240, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %26241 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %26242 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %26243 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %26244 = "torch.prim.ListConstruct"(%18993, %26241, %26242, %26243) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %26245 = "torch.aten.view"(%26240, %26244) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%26245, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %26246 = "torch.prim.ListConstruct"(%26233) : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>>
    %26247 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %26248 = "torch.aten.index_put"(%26245, %26246, %26231, %26247) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.bool) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%26248, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %26249 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %26250 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %26251 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %26252 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %26253 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %26254 = "torch.prim.ListConstruct"(%18479, %26249, %26250, %26251, %26252, %26253) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %26255 = "torch.aten.view"(%26248, %26254) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%26255, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %26256 = "torch.constant.int"() <{value = 2097152 : i64}> : () -> !torch.int
    %26257 = "torch.prim.ListConstruct"(%18479, %26256) : (!torch.int, !torch.int) -> !torch.list<int>
    %26258 = "torch.aten.view"(%26255, %26257) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,2097152],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%26258, %18475) <{shape_expressions = #map2}> : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> ()
    %26259 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %26260 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %26261 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %26262 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %26263 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %26264 = "torch.prim.ListConstruct"(%18479, %26259, %26260, %26261, %26262, %26263) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %26265 = "torch.aten.view"(%26258, %26264) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%26265, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %26266 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %26267 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %26268 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %26269 = "torch.prim.ListConstruct"(%18993, %26266, %26267, %26268) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %26270 = "torch.aten.view"(%26265, %26269) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%26270, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %26271 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %26272 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %26273 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %26274 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %26275 = "torch.prim.ListConstruct"(%26271, %18477, %26272, %26273, %26274) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %26276 = "torch.aten.view"(%25915, %26275) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%26276, %18474) <{shape_expressions = #map22}> : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %26277 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %26278 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %26279 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %26280 = "torch.prim.ListConstruct"(%19011, %26277, %26278, %26279) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %26281 = "torch.aten.view"(%26276, %26280) : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%26281, %18474) <{shape_expressions = #map23}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %26282 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %26283 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %26284 = "torch.aten.add.Scalar"(%26220, %26282, %26283) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%26284, %18474) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %26285 = "torch.prim.ListConstruct"(%19011) : (!torch.int) -> !torch.list<int>
    %26286 = "torch.aten.view"(%26284, %26285) : (!torch.vtensor<[4,?],si64>, !torch.list<int>) -> !torch.vtensor<[?],si64>
    "torch.bind_symbolic_shape"(%26286, %18474) <{shape_expressions = #map24}> : (!torch.vtensor<[?],si64>, !torch.int) -> ()
    %26287 = "torch.prim.ListConstruct"(%26286) : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>>
    %26288 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %26289 = "torch.aten.index_put"(%26270, %26287, %26281, %26288) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.bool) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%26289, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %26290 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %26291 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %26292 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %26293 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %26294 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %26295 = "torch.prim.ListConstruct"(%18479, %26290, %26291, %26292, %26293, %26294) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %26296 = "torch.aten.view"(%26289, %26295) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%26296, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %26297 = "torch.constant.int"() <{value = 2097152 : i64}> : () -> !torch.int
    %26298 = "torch.prim.ListConstruct"(%18479, %26297) : (!torch.int, !torch.int) -> !torch.list<int>
    %26299 = "torch.aten.view"(%26296, %26298) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,2097152],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%26299, %18475) <{shape_expressions = #map2}> : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> ()
    %26300 = "torch.constant.int"() <{value = -2 : i64}> : () -> !torch.int
    %26301 = "torch.aten.unsqueeze"(%26215, %26300) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%26301, %18474) <{shape_expressions = #map25}> : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.int) -> ()
    %26302 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %26303 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %26304 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %26305 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %26306 = "torch.prim.ListConstruct"(%26302, %18481, %26303, %26304, %26305) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %26307 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %26308 = "torch.aten.expand"(%26301, %26306, %26307) : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%26308, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %26309 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %26310 = "torch.aten.clone"(%26308, %26309) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%26310, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %26311 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %26312 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %26313 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %26314 = "torch.prim.ListConstruct"(%26311, %18481, %26312, %26313) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %26315 = "torch.aten._unsafe_view"(%26310, %26314) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%26315, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %26316 = "torch.constant.int"() <{value = -2 : i64}> : () -> !torch.int
    %26317 = "torch.aten.unsqueeze"(%25915, %26316) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%26317, %18474) <{shape_expressions = #map25}> : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.int) -> ()
    %26318 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %26319 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %26320 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %26321 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %26322 = "torch.prim.ListConstruct"(%26318, %18481, %26319, %26320, %26321) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %26323 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %26324 = "torch.aten.expand"(%26317, %26322, %26323) : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%26324, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %26325 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %26326 = "torch.aten.clone"(%26324, %26325) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%26326, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %26327 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %26328 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %26329 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %26330 = "torch.prim.ListConstruct"(%26327, %18481, %26328, %26329) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %26331 = "torch.aten._unsafe_view"(%26326, %26330) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%26331, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %26332 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %26333 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %26334 = "torch.aten.transpose.int"(%26065, %26332, %26333) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%26334, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %26335 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %26336 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %26337 = "torch.aten.transpose.int"(%26315, %26335, %26336) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%26337, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %26338 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %26339 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %26340 = "torch.aten.transpose.int"(%26331, %26338, %26339) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%26340, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %26341 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %26342 = "torch.aten.squeeze.dim"(%18570, %26341) : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,1,?,?],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%26342, %18474) <{shape_expressions = #map8}> : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> ()
    %26343 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %26344 = "torch.aten.squeeze.dim"(%26342, %26343) : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,1,?,?],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%26344, %18474) <{shape_expressions = #map8}> : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> ()
    %26345 = "torch_c.to_builtin_tensor"(%26334) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %26346 = "torch_c.to_builtin_tensor"(%26337) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %26347 = "torch_c.to_builtin_tensor"(%26340) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %26348 = "torch_c.to_builtin_tensor"(%26344) : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>) -> tensor<4x1x?x?xf8E4M3FNUZ>
    %26349 = "tensor.cast"(%26348) : (tensor<4x1x?x?xf8E4M3FNUZ>) -> tensor<?x?x?x?xf8E4M3FNUZ>
    %26350 = "torch_c.to_builtin_tensor"(%17649) : (!torch.vtensor<[],f32>) -> tensor<f32>
    %26351 = "util.call"(%26345, %26346, %26347, %26350, %26349) <{callee = @sharktank_masked_flash_attention_4_32_128_128_f8E4M3FNUZ_f32_f32}> : (tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<f32>, tensor<?x?x?x?xf8E4M3FNUZ>) -> tensor<4x32x?x128xf32>
    %26352 = "torch_c.from_builtin_tensor"(%26351) : (tensor<4x32x?x128xf32>) -> !torch.vtensor<[4,32,?,128],f32>
    "torch.bind_symbolic_shape"(%26352, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f32>, !torch.int) -> ()
    %26353 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %26354 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %26355 = "torch.aten.transpose.int"(%26352, %26353, %26354) : (!torch.vtensor<[4,32,?,128],f32>, !torch.int, !torch.int) -> !torch.vtensor<[4,?,32,128],f32>
    "torch.bind_symbolic_shape"(%26355, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f32>, !torch.int) -> ()
    %26356 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %26357 = "torch.aten.clone"(%26355, %26356) : (!torch.vtensor<[4,?,32,128],f32>, !torch.int) -> !torch.vtensor<[4,?,32,128],f32>
    "torch.bind_symbolic_shape"(%26357, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f32>, !torch.int) -> ()
    %26358 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %26359 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %26360 = "torch.prim.ListConstruct"(%26358, %18481, %26359) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %26361 = "torch.aten._unsafe_view"(%26357, %26360) : (!torch.vtensor<[4,?,32,128],f32>, !torch.list<int>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%26361, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %26362 = "torch.aten.div.Tensor"(%26361, %17651) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%26362, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %26363 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %26364 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %26365 = "torch.aten.clamp"(%26362, %26363, %26364) : (!torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%26365, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %26366 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %26367 = "torch.prims.convert_element_type"(%26365, %26366) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%26367, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %26368 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %26369 = "torch.aten.unsqueeze"(%17653, %26368) : (!torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>
    %26370 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %26371 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %26372 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %26373 = "torch.prim.ListConstruct"(%26370, %26371, %26372) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %26374 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %26375 = "torch.aten.expand"(%26369, %26373, %26374) : (!torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ>
    %26376 = "torch_c.to_builtin_tensor"(%26367) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %26377 = "torch_c.to_builtin_tensor"(%26375) : (!torch.vtensor<[4,4096,4096],f8E4M3FNUZ>) -> tensor<4x4096x4096xf8E4M3FNUZ>
    %26378 = "util.call"(%26376, %26377) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x?x4096xf32>
    %26379 = "torch_c.from_builtin_tensor"(%26378) : (tensor<4x?x4096xf32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%26379, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %26380 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %26381 = "torch.prims.convert_element_type"(%26379, %26380) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%26381, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %26382 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %26383 = "torch.aten.add.Tensor"(%25809, %26381, %26382) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%26383, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %26384 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %26385 = "torch.prims.convert_element_type"(%26383, %26384) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%26385, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %26386 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %26387 = "torch.aten.pow.Tensor_Scalar"(%26385, %26386) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%26387, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %26388 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %26389 = "torch.prim.ListConstruct"(%26388) : (!torch.int) -> !torch.list<int>
    %26390 = "torch.constant.bool"() <{value = true}> : () -> !torch.bool
    %26391 = "torch.constant.none"() : () -> !torch.none
    %26392 = "torch.aten.mean.dim"(%26387, %26389, %26390, %26391) : (!torch.vtensor<[4,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%26392, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %26393 = "torch.constant.float"() <{value = 1.000000e-05 : f64}> : () -> !torch.float
    %26394 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %26395 = "torch.aten.add.Scalar"(%26392, %26393, %26394) : (!torch.vtensor<[4,?,1],f32>, !torch.float, !torch.int) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%26395, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %26396 = "torch.aten.rsqrt"(%26395) : (!torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%26396, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %26397 = "torch.aten.mul.Tensor"(%26385, %26396) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%26397, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %26398 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %26399 = "torch.prims.convert_element_type"(%26397, %26398) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%26399, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %26400 = "torch.aten.mul.Tensor"(%17655, %26399) : (!torch.vtensor<[4096],bf16>, !torch.vtensor<[4,?,4096],bf16>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%26400, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %26401 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %26402 = "torch.prims.convert_element_type"(%26400, %26401) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%26402, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %26403 = "torch.aten.div.Tensor"(%26402, %17657) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%26403, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %26404 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %26405 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %26406 = "torch.aten.clamp"(%26403, %26404, %26405) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%26406, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %26407 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %26408 = "torch.prims.convert_element_type"(%26406, %26407) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%26408, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %26409 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %26410 = "torch.aten.unsqueeze"(%17659, %26409) : (!torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>
    %26411 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %26412 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %26413 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %26414 = "torch.prim.ListConstruct"(%26411, %26412, %26413) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %26415 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %26416 = "torch.aten.expand"(%26410, %26414, %26415) : (!torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ>
    %26417 = "torch_c.to_builtin_tensor"(%26408) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %26418 = "torch_c.to_builtin_tensor"(%26416) : (!torch.vtensor<[4,14336,4096],f8E4M3FNUZ>) -> tensor<4x14336x4096xf8E4M3FNUZ>
    %26419 = "util.call"(%26417, %26418) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x?x14336xf32>
    %26420 = "torch_c.from_builtin_tensor"(%26419) : (tensor<4x?x14336xf32>) -> !torch.vtensor<[4,?,14336],f32>
    "torch.bind_symbolic_shape"(%26420, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> ()
    %26421 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %26422 = "torch.prims.convert_element_type"(%26420, %26421) : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%26422, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %26423 = "torch.aten.silu"(%26422) : (!torch.vtensor<[4,?,14336],bf16>) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%26423, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %26424 = "torch.aten.div.Tensor"(%26402, %17661) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%26424, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %26425 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %26426 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %26427 = "torch.aten.clamp"(%26424, %26425, %26426) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%26427, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %26428 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %26429 = "torch.prims.convert_element_type"(%26427, %26428) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%26429, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %26430 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %26431 = "torch.aten.unsqueeze"(%17663, %26430) : (!torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>
    %26432 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %26433 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %26434 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %26435 = "torch.prim.ListConstruct"(%26432, %26433, %26434) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %26436 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %26437 = "torch.aten.expand"(%26431, %26435, %26436) : (!torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ>
    %26438 = "torch_c.to_builtin_tensor"(%26429) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %26439 = "torch_c.to_builtin_tensor"(%26437) : (!torch.vtensor<[4,14336,4096],f8E4M3FNUZ>) -> tensor<4x14336x4096xf8E4M3FNUZ>
    %26440 = "util.call"(%26438, %26439) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x?x14336xf32>
    %26441 = "torch_c.from_builtin_tensor"(%26440) : (tensor<4x?x14336xf32>) -> !torch.vtensor<[4,?,14336],f32>
    "torch.bind_symbolic_shape"(%26441, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> ()
    %26442 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %26443 = "torch.prims.convert_element_type"(%26441, %26442) : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%26443, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %26444 = "torch.aten.mul.Tensor"(%26423, %26443) : (!torch.vtensor<[4,?,14336],bf16>, !torch.vtensor<[4,?,14336],bf16>) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%26444, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %26445 = "torch.aten.div.Tensor"(%26444, %17665) : (!torch.vtensor<[4,?,14336],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%26445, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %26446 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %26447 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %26448 = "torch.aten.clamp"(%26445, %26446, %26447) : (!torch.vtensor<[4,?,14336],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%26448, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %26449 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %26450 = "torch.prims.convert_element_type"(%26448, %26449) : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> !torch.vtensor<[4,?,14336],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%26450, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],f8E4M3FNUZ>, !torch.int) -> ()
    %26451 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %26452 = "torch.aten.unsqueeze"(%17667, %26451) : (!torch.vtensor<[4096,14336],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,14336],f8E4M3FNUZ>
    %26453 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %26454 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %26455 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %26456 = "torch.prim.ListConstruct"(%26453, %26454, %26455) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %26457 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %26458 = "torch.aten.expand"(%26452, %26456, %26457) : (!torch.vtensor<[1,4096,14336],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,14336],f8E4M3FNUZ>
    %26459 = "torch_c.to_builtin_tensor"(%26450) : (!torch.vtensor<[4,?,14336],f8E4M3FNUZ>) -> tensor<4x?x14336xf8E4M3FNUZ>
    %26460 = "torch_c.to_builtin_tensor"(%26458) : (!torch.vtensor<[4,4096,14336],f8E4M3FNUZ>) -> tensor<4x4096x14336xf8E4M3FNUZ>
    %26461 = "util.call"(%26459, %26460) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx14336xf8E4M3FNUZ_R4x4096x14336xf8E4M3FNUZ}> : (tensor<4x?x14336xf8E4M3FNUZ>, tensor<4x4096x14336xf8E4M3FNUZ>) -> tensor<4x?x4096xf32>
    %26462 = "torch_c.from_builtin_tensor"(%26461) : (tensor<4x?x4096xf32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%26462, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %26463 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %26464 = "torch.prims.convert_element_type"(%26462, %26463) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%26464, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %26465 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %26466 = "torch.aten.add.Tensor"(%26383, %26464, %26465) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%26466, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %26467 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %26468 = "torch.prims.convert_element_type"(%26466, %26467) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%26468, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %26469 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %26470 = "torch.aten.pow.Tensor_Scalar"(%26468, %26469) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%26470, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %26471 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %26472 = "torch.prim.ListConstruct"(%26471) : (!torch.int) -> !torch.list<int>
    %26473 = "torch.constant.bool"() <{value = true}> : () -> !torch.bool
    %26474 = "torch.constant.none"() : () -> !torch.none
    %26475 = "torch.aten.mean.dim"(%26470, %26472, %26473, %26474) : (!torch.vtensor<[4,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%26475, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %26476 = "torch.constant.float"() <{value = 1.000000e-05 : f64}> : () -> !torch.float
    %26477 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %26478 = "torch.aten.add.Scalar"(%26475, %26476, %26477) : (!torch.vtensor<[4,?,1],f32>, !torch.float, !torch.int) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%26478, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %26479 = "torch.aten.rsqrt"(%26478) : (!torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%26479, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %26480 = "torch.aten.mul.Tensor"(%26468, %26479) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%26480, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %26481 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %26482 = "torch.prims.convert_element_type"(%26480, %26481) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%26482, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %26483 = "torch.aten.mul.Tensor"(%17669, %26482) : (!torch.vtensor<[4096],bf16>, !torch.vtensor<[4,?,4096],bf16>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%26483, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %26484 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %26485 = "torch.prims.convert_element_type"(%26483, %26484) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%26485, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %26486 = "torch.aten.div.Tensor"(%26485, %17671) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%26486, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %26487 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %26488 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %26489 = "torch.aten.clamp"(%26486, %26487, %26488) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%26489, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %26490 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %26491 = "torch.prims.convert_element_type"(%26489, %26490) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%26491, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %26492 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %26493 = "torch.aten.unsqueeze"(%17673, %26492) : (!torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>
    %26494 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %26495 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %26496 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %26497 = "torch.prim.ListConstruct"(%26494, %26495, %26496) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %26498 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %26499 = "torch.aten.expand"(%26493, %26497, %26498) : (!torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ>
    %26500 = "torch_c.to_builtin_tensor"(%26491) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %26501 = "torch_c.to_builtin_tensor"(%26499) : (!torch.vtensor<[4,4096,4096],f8E4M3FNUZ>) -> tensor<4x4096x4096xf8E4M3FNUZ>
    %26502 = "util.call"(%26500, %26501) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x?x4096xf32>
    %26503 = "torch_c.from_builtin_tensor"(%26502) : (tensor<4x?x4096xf32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%26503, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %26504 = "torch.aten.div.Tensor"(%26503, %17675) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%26504, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %26505 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %26506 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %26507 = "torch.aten.clamp"(%26504, %26505, %26506) : (!torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%26507, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %26508 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %26509 = "torch.prims.convert_element_type"(%26507, %26508) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%26509, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %26510 = "torch.aten.div.Tensor"(%26485, %17677) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%26510, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %26511 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %26512 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %26513 = "torch.aten.clamp"(%26510, %26511, %26512) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%26513, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %26514 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %26515 = "torch.prims.convert_element_type"(%26513, %26514) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%26515, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %26516 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %26517 = "torch.aten.unsqueeze"(%17679, %26516) : (!torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>
    %26518 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %26519 = "torch.constant.int"() <{value = 1024 : i64}> : () -> !torch.int
    %26520 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %26521 = "torch.prim.ListConstruct"(%26518, %26519, %26520) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %26522 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %26523 = "torch.aten.expand"(%26517, %26521, %26522) : (!torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ>
    %26524 = "torch_c.to_builtin_tensor"(%26515) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %26525 = "torch_c.to_builtin_tensor"(%26523) : (!torch.vtensor<[4,1024,4096],f8E4M3FNUZ>) -> tensor<4x1024x4096xf8E4M3FNUZ>
    %26526 = "util.call"(%26524, %26525) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x?x1024xf32>
    %26527 = "torch_c.from_builtin_tensor"(%26526) : (tensor<4x?x1024xf32>) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%26527, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %26528 = "torch.aten.div.Tensor"(%26527, %17681) : (!torch.vtensor<[4,?,1024],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%26528, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %26529 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %26530 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %26531 = "torch.aten.clamp"(%26528, %26529, %26530) : (!torch.vtensor<[4,?,1024],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%26531, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %26532 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %26533 = "torch.prims.convert_element_type"(%26531, %26532) : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> !torch.vtensor<[4,?,1024],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%26533, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.int) -> ()
    %26534 = "torch.aten.div.Tensor"(%26485, %17683) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%26534, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %26535 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %26536 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %26537 = "torch.aten.clamp"(%26534, %26535, %26536) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%26537, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %26538 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %26539 = "torch.prims.convert_element_type"(%26537, %26538) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%26539, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %26540 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %26541 = "torch.aten.unsqueeze"(%17685, %26540) : (!torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>
    %26542 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %26543 = "torch.constant.int"() <{value = 1024 : i64}> : () -> !torch.int
    %26544 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %26545 = "torch.prim.ListConstruct"(%26542, %26543, %26544) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %26546 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %26547 = "torch.aten.expand"(%26541, %26545, %26546) : (!torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ>
    %26548 = "torch_c.to_builtin_tensor"(%26539) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %26549 = "torch_c.to_builtin_tensor"(%26547) : (!torch.vtensor<[4,1024,4096],f8E4M3FNUZ>) -> tensor<4x1024x4096xf8E4M3FNUZ>
    %26550 = "util.call"(%26548, %26549) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x?x1024xf32>
    %26551 = "torch_c.from_builtin_tensor"(%26550) : (tensor<4x?x1024xf32>) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%26551, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %26552 = "torch.aten.div.Tensor"(%26551, %17687) : (!torch.vtensor<[4,?,1024],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%26552, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %26553 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %26554 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %26555 = "torch.aten.clamp"(%26552, %26553, %26554) : (!torch.vtensor<[4,?,1024],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%26555, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %26556 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %26557 = "torch.prims.convert_element_type"(%26555, %26556) : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> !torch.vtensor<[4,?,1024],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%26557, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.int) -> ()
    %26558 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %26559 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %26560 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %26561 = "torch.prim.ListConstruct"(%26558, %18481, %26559, %26560) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %26562 = "torch.aten.view"(%26509, %26561) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%26562, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %26563 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %26564 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %26565 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %26566 = "torch.prim.ListConstruct"(%26563, %18481, %26564, %26565) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %26567 = "torch.aten.view"(%26533, %26566) : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%26567, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %26568 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %26569 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %26570 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %26571 = "torch.prim.ListConstruct"(%26568, %18481, %26569, %26570) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %26572 = "torch.aten.view"(%26557, %26571) : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%26572, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %26573 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
    %26574 = "torch.constant.none"() : () -> !torch.none
    %26575 = "torch.constant.none"() : () -> !torch.none
    %26576 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %26577 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %26578 = "torch.aten.arange"(%26573, %26574, %26575, %26576, %26577) : (!torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[131072],si64>
    %26579 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %26580 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %26581 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %26582 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %26583 = "torch.constant.none"() : () -> !torch.none
    %26584 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %26585 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %26586 = "torch.aten.arange.start_step"(%26579, %26580, %26581, %26582, %26583, %26584, %26585) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[64],si64>
    %26587 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %26588 = "torch.prims.convert_element_type"(%26586, %26587) : (!torch.vtensor<[64],si64>, !torch.int) -> !torch.vtensor<[64],f32>
    %26589 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %26590 = "torch.aten.div.Scalar"(%26588, %26589) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %26591 = "torch.constant.float"() <{value = 5.000000e+05 : f64}> : () -> !torch.float
    %26592 = "torch.aten.pow.Scalar"(%26591, %26590) : (!torch.float, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %26593 = "torch.aten.reciprocal"(%26592) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %26594 = "torch.constant.float"() <{value = 1.000000e+00 : f64}> : () -> !torch.float
    %26595 = "torch.aten.mul.Scalar"(%26593, %26594) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
    %26596 = "torch.aten.reciprocal"(%26595) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %26597 = "torch.constant.float"() <{value = 6.2831853071795862 : f64}> : () -> !torch.float
    %26598 = "torch.aten.mul.Scalar"(%26596, %26597) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
    %26599 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
    %26600 = "torch.aten.gt.Scalar"(%26598, %26599) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %26601 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %26602 = "torch.aten.div.Scalar"(%26595, %26601) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %26603 = "torch.aten.where.self"(%26600, %26602, %26595) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %26604 = "torch.aten.reciprocal"(%26598) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %26605 = "torch.constant.int"() <{value = 8192 : i64}> : () -> !torch.int
    %26606 = "torch.aten.mul.Scalar"(%26604, %26605) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %26607 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %26608 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %26609 = "torch.aten.sub.Scalar"(%26606, %26607, %26608) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
    %26610 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %26611 = "torch.aten.div.Scalar"(%26609, %26610) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %26612 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %26613 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %26614 = "torch.aten.rsub.Scalar"(%26611, %26612, %26613) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
    %26615 = "torch.aten.mul.Tensor"(%26614, %26603) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %26616 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %26617 = "torch.aten.div.Scalar"(%26615, %26616) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %26618 = "torch.aten.mul.Tensor"(%26611, %26603) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %26619 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %26620 = "torch.aten.add.Tensor"(%26617, %26618, %26619) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %26621 = "torch.constant.float"() <{value = 2.048000e+03 : f64}> : () -> !torch.float
    %26622 = "torch.aten.lt.Scalar"(%26598, %26621) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %26623 = "torch.aten.bitwise_not"(%26622) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %26624 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
    %26625 = "torch.aten.gt.Scalar"(%26598, %26624) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %26626 = "torch.aten.bitwise_not"(%26625) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %26627 = "torch.aten.mul.Tensor"(%26623, %26626) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %26628 = "torch.aten.where.self"(%26627, %26620, %26603) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %26629 = "torch.prim.ListConstruct"(%26628, %26628) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.list<vtensor>
    %26630 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %26631 = "torch.aten.cat"(%26629, %26630) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[128],f32>
    %26632 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %26633 = "torch.prims.convert_element_type"(%26578, %26632) : (!torch.vtensor<[131072],si64>, !torch.int) -> !torch.vtensor<[131072],f32>
    %26634 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %26635 = "torch.prims.convert_element_type"(%26631, %26634) : (!torch.vtensor<[128],f32>, !torch.int) -> !torch.vtensor<[128],f32>
    %26636 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
    %26637 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %26638 = "torch.prim.ListConstruct"(%26636, %26637) : (!torch.int, !torch.int) -> !torch.list<int>
    %26639 = "torch.aten.view"(%26633, %26638) : (!torch.vtensor<[131072],f32>, !torch.list<int>) -> !torch.vtensor<[131072,1],f32>
    %26640 = "torch.aten.mul.Tensor"(%26639, %26635) : (!torch.vtensor<[131072,1],f32>, !torch.vtensor<[128],f32>) -> !torch.vtensor<[131072,128],f32>
    %26641 = "torch.aten.cos"(%26640) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
    %26642 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %26643 = "torch.prims.convert_element_type"(%26641, %26642) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
    %26644 = "torch.aten.sin"(%26640) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
    %26645 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %26646 = "torch.prims.convert_element_type"(%26644, %26645) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
    %26647 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %26648 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %26649 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %26650 = "torch.aten.slice.Tensor"(%26643, %26647, %26648, %18481, %26649) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%26650, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %26651 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %26652 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %26653 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %26654 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %26655 = "torch.aten.slice.Tensor"(%26650, %26651, %26652, %26653, %26654) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%26655, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %26656 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %26657 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %26658 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %26659 = "torch.aten.slice.Tensor"(%26646, %26656, %26657, %18481, %26658) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%26659, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %26660 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %26661 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %26662 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %26663 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %26664 = "torch.aten.slice.Tensor"(%26659, %26660, %26661, %26662, %26663) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%26664, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %26665 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %26666 = "torch.aten.unsqueeze"(%26655, %26665) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%26666, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %26667 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %26668 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %26669 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %26670 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %26671 = "torch.aten.slice.Tensor"(%26666, %26667, %26668, %26669, %26670) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%26671, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %26672 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %26673 = "torch.aten.unsqueeze"(%26671, %26672) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%26673, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %26674 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %26675 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %26676 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %26677 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %26678 = "torch.aten.slice.Tensor"(%26673, %26674, %26675, %26676, %26677) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%26678, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %26679 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %26680 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %26681 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %26682 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %26683 = "torch.prim.ListConstruct"(%26679, %26680, %26681, %26682) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %26684 = "torch.aten.repeat"(%26678, %26683) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%26684, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
    %26685 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %26686 = "torch.aten.unsqueeze"(%26664, %26685) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%26686, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %26687 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %26688 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %26689 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %26690 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %26691 = "torch.aten.slice.Tensor"(%26686, %26687, %26688, %26689, %26690) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%26691, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %26692 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %26693 = "torch.aten.unsqueeze"(%26691, %26692) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%26693, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %26694 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %26695 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %26696 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %26697 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %26698 = "torch.aten.slice.Tensor"(%26693, %26694, %26695, %26696, %26697) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%26698, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %26699 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %26700 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %26701 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %26702 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %26703 = "torch.prim.ListConstruct"(%26699, %26700, %26701, %26702) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %26704 = "torch.aten.repeat"(%26698, %26703) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%26704, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
    %26705 = "torch.aten.mul.Tensor"(%26562, %26684) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%26705, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %26706 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %26707 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %26708 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %26709 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %26710 = "torch.aten.slice.Tensor"(%26562, %26706, %26707, %26708, %26709) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%26710, %18474) <{shape_expressions = #map18}> : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.int) -> ()
    %26711 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %26712 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %26713 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %26714 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %26715 = "torch.aten.slice.Tensor"(%26562, %26711, %26712, %26713, %26714) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%26715, %18474) <{shape_expressions = #map18}> : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.int) -> ()
    %26716 = "torch.aten.neg"(%26715) : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>) -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%26716, %18474) <{shape_expressions = #map18}> : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.int) -> ()
    %26717 = "torch.prim.ListConstruct"(%26716, %26710) : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>) -> !torch.list<vtensor>
    %26718 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %26719 = "torch.aten.cat"(%26717, %26718) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%26719, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %26720 = "torch.aten.mul.Tensor"(%26719, %26704) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%26720, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %26721 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %26722 = "torch.aten.add.Tensor"(%26705, %26720, %26721) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%26722, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %26723 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
    %26724 = "torch.constant.none"() : () -> !torch.none
    %26725 = "torch.constant.none"() : () -> !torch.none
    %26726 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %26727 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %26728 = "torch.aten.arange"(%26723, %26724, %26725, %26726, %26727) : (!torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[131072],si64>
    %26729 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %26730 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %26731 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %26732 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %26733 = "torch.constant.none"() : () -> !torch.none
    %26734 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %26735 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %26736 = "torch.aten.arange.start_step"(%26729, %26730, %26731, %26732, %26733, %26734, %26735) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[64],si64>
    %26737 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %26738 = "torch.prims.convert_element_type"(%26736, %26737) : (!torch.vtensor<[64],si64>, !torch.int) -> !torch.vtensor<[64],f32>
    %26739 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %26740 = "torch.aten.div.Scalar"(%26738, %26739) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %26741 = "torch.constant.float"() <{value = 5.000000e+05 : f64}> : () -> !torch.float
    %26742 = "torch.aten.pow.Scalar"(%26741, %26740) : (!torch.float, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %26743 = "torch.aten.reciprocal"(%26742) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %26744 = "torch.constant.float"() <{value = 1.000000e+00 : f64}> : () -> !torch.float
    %26745 = "torch.aten.mul.Scalar"(%26743, %26744) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
    %26746 = "torch.aten.reciprocal"(%26745) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %26747 = "torch.constant.float"() <{value = 6.2831853071795862 : f64}> : () -> !torch.float
    %26748 = "torch.aten.mul.Scalar"(%26746, %26747) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
    %26749 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
    %26750 = "torch.aten.gt.Scalar"(%26748, %26749) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %26751 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %26752 = "torch.aten.div.Scalar"(%26745, %26751) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %26753 = "torch.aten.where.self"(%26750, %26752, %26745) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %26754 = "torch.aten.reciprocal"(%26748) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %26755 = "torch.constant.int"() <{value = 8192 : i64}> : () -> !torch.int
    %26756 = "torch.aten.mul.Scalar"(%26754, %26755) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %26757 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %26758 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %26759 = "torch.aten.sub.Scalar"(%26756, %26757, %26758) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
    %26760 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %26761 = "torch.aten.div.Scalar"(%26759, %26760) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %26762 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %26763 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %26764 = "torch.aten.rsub.Scalar"(%26761, %26762, %26763) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
    %26765 = "torch.aten.mul.Tensor"(%26764, %26753) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %26766 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %26767 = "torch.aten.div.Scalar"(%26765, %26766) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %26768 = "torch.aten.mul.Tensor"(%26761, %26753) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %26769 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %26770 = "torch.aten.add.Tensor"(%26767, %26768, %26769) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %26771 = "torch.constant.float"() <{value = 2.048000e+03 : f64}> : () -> !torch.float
    %26772 = "torch.aten.lt.Scalar"(%26748, %26771) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %26773 = "torch.aten.bitwise_not"(%26772) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %26774 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
    %26775 = "torch.aten.gt.Scalar"(%26748, %26774) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %26776 = "torch.aten.bitwise_not"(%26775) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %26777 = "torch.aten.mul.Tensor"(%26773, %26776) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %26778 = "torch.aten.where.self"(%26777, %26770, %26753) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %26779 = "torch.prim.ListConstruct"(%26778, %26778) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.list<vtensor>
    %26780 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %26781 = "torch.aten.cat"(%26779, %26780) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[128],f32>
    %26782 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %26783 = "torch.prims.convert_element_type"(%26728, %26782) : (!torch.vtensor<[131072],si64>, !torch.int) -> !torch.vtensor<[131072],f32>
    %26784 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %26785 = "torch.prims.convert_element_type"(%26781, %26784) : (!torch.vtensor<[128],f32>, !torch.int) -> !torch.vtensor<[128],f32>
    %26786 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
    %26787 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %26788 = "torch.prim.ListConstruct"(%26786, %26787) : (!torch.int, !torch.int) -> !torch.list<int>
    %26789 = "torch.aten.view"(%26783, %26788) : (!torch.vtensor<[131072],f32>, !torch.list<int>) -> !torch.vtensor<[131072,1],f32>
    %26790 = "torch.aten.mul.Tensor"(%26789, %26785) : (!torch.vtensor<[131072,1],f32>, !torch.vtensor<[128],f32>) -> !torch.vtensor<[131072,128],f32>
    %26791 = "torch.aten.cos"(%26790) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
    %26792 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %26793 = "torch.prims.convert_element_type"(%26791, %26792) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
    %26794 = "torch.aten.sin"(%26790) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
    %26795 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %26796 = "torch.prims.convert_element_type"(%26794, %26795) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
    %26797 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %26798 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %26799 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %26800 = "torch.aten.slice.Tensor"(%26793, %26797, %26798, %18481, %26799) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%26800, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %26801 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %26802 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %26803 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %26804 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %26805 = "torch.aten.slice.Tensor"(%26800, %26801, %26802, %26803, %26804) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%26805, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %26806 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %26807 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %26808 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %26809 = "torch.aten.slice.Tensor"(%26796, %26806, %26807, %18481, %26808) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%26809, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %26810 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %26811 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %26812 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %26813 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %26814 = "torch.aten.slice.Tensor"(%26809, %26810, %26811, %26812, %26813) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%26814, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %26815 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %26816 = "torch.aten.unsqueeze"(%26805, %26815) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%26816, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %26817 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %26818 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %26819 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %26820 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %26821 = "torch.aten.slice.Tensor"(%26816, %26817, %26818, %26819, %26820) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%26821, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %26822 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %26823 = "torch.aten.unsqueeze"(%26821, %26822) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%26823, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %26824 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %26825 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %26826 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %26827 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %26828 = "torch.aten.slice.Tensor"(%26823, %26824, %26825, %26826, %26827) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%26828, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %26829 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %26830 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %26831 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %26832 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %26833 = "torch.prim.ListConstruct"(%26829, %26830, %26831, %26832) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %26834 = "torch.aten.repeat"(%26828, %26833) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%26834, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
    %26835 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %26836 = "torch.aten.unsqueeze"(%26814, %26835) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%26836, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %26837 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %26838 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %26839 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %26840 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %26841 = "torch.aten.slice.Tensor"(%26836, %26837, %26838, %26839, %26840) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%26841, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %26842 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %26843 = "torch.aten.unsqueeze"(%26841, %26842) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%26843, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %26844 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %26845 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %26846 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %26847 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %26848 = "torch.aten.slice.Tensor"(%26843, %26844, %26845, %26846, %26847) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%26848, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %26849 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %26850 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %26851 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %26852 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %26853 = "torch.prim.ListConstruct"(%26849, %26850, %26851, %26852) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %26854 = "torch.aten.repeat"(%26848, %26853) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%26854, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
    %26855 = "torch.aten.mul.Tensor"(%26567, %26834) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%26855, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %26856 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %26857 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %26858 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %26859 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %26860 = "torch.aten.slice.Tensor"(%26567, %26856, %26857, %26858, %26859) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%26860, %18474) <{shape_expressions = #map19}> : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.int) -> ()
    %26861 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %26862 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %26863 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %26864 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %26865 = "torch.aten.slice.Tensor"(%26567, %26861, %26862, %26863, %26864) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%26865, %18474) <{shape_expressions = #map19}> : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.int) -> ()
    %26866 = "torch.aten.neg"(%26865) : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>) -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%26866, %18474) <{shape_expressions = #map19}> : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.int) -> ()
    %26867 = "torch.prim.ListConstruct"(%26866, %26860) : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>) -> !torch.list<vtensor>
    %26868 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %26869 = "torch.aten.cat"(%26867, %26868) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%26869, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %26870 = "torch.aten.mul.Tensor"(%26869, %26854) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%26870, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %26871 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %26872 = "torch.aten.add.Tensor"(%26855, %26870, %26871) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%26872, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %26873 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %26874 = "torch.aten.mul.Scalar"(%arg69, %26873) : (!torch.vtensor<[4,?],si64>, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%26874, %18474) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %26875 = "torch.constant.int"() <{value = 24 : i64}> : () -> !torch.int
    %26876 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %26877 = "torch.aten.add.Scalar"(%26874, %26875, %26876) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%26877, %18474) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %26878 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %26879 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %26880 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %26881 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %26882 = "torch.prim.ListConstruct"(%26878, %18477, %26879, %26880, %26881) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %26883 = "torch.aten.view"(%26872, %26882) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%26883, %18474) <{shape_expressions = #map22}> : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %26884 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %26885 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %26886 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %26887 = "torch.prim.ListConstruct"(%19011, %26884, %26885, %26886) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %26888 = "torch.aten.view"(%26883, %26887) : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%26888, %18474) <{shape_expressions = #map23}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %26889 = "torch.prim.ListConstruct"(%19011) : (!torch.int) -> !torch.list<int>
    %26890 = "torch.aten.view"(%26877, %26889) : (!torch.vtensor<[4,?],si64>, !torch.list<int>) -> !torch.vtensor<[?],si64>
    "torch.bind_symbolic_shape"(%26890, %18474) <{shape_expressions = #map24}> : (!torch.vtensor<[?],si64>, !torch.int) -> ()
    %26891 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %26892 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %26893 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %26894 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %26895 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %26896 = "torch.prim.ListConstruct"(%18479, %26891, %26892, %26893, %26894, %26895) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %26897 = "torch.aten.view"(%26299, %26896) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%26897, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %26898 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %26899 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %26900 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %26901 = "torch.prim.ListConstruct"(%18993, %26898, %26899, %26900) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %26902 = "torch.aten.view"(%26897, %26901) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%26902, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %26903 = "torch.prim.ListConstruct"(%26890) : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>>
    %26904 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %26905 = "torch.aten.index_put"(%26902, %26903, %26888, %26904) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.bool) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%26905, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %26906 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %26907 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %26908 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %26909 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %26910 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %26911 = "torch.prim.ListConstruct"(%18479, %26906, %26907, %26908, %26909, %26910) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %26912 = "torch.aten.view"(%26905, %26911) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%26912, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %26913 = "torch.constant.int"() <{value = 2097152 : i64}> : () -> !torch.int
    %26914 = "torch.prim.ListConstruct"(%18479, %26913) : (!torch.int, !torch.int) -> !torch.list<int>
    %26915 = "torch.aten.view"(%26912, %26914) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,2097152],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%26915, %18475) <{shape_expressions = #map2}> : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> ()
    %26916 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %26917 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %26918 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %26919 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %26920 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %26921 = "torch.prim.ListConstruct"(%18479, %26916, %26917, %26918, %26919, %26920) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %26922 = "torch.aten.view"(%26915, %26921) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%26922, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %26923 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %26924 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %26925 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %26926 = "torch.prim.ListConstruct"(%18993, %26923, %26924, %26925) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %26927 = "torch.aten.view"(%26922, %26926) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%26927, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %26928 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %26929 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %26930 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %26931 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %26932 = "torch.prim.ListConstruct"(%26928, %18477, %26929, %26930, %26931) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %26933 = "torch.aten.view"(%26572, %26932) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%26933, %18474) <{shape_expressions = #map22}> : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %26934 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %26935 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %26936 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %26937 = "torch.prim.ListConstruct"(%19011, %26934, %26935, %26936) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %26938 = "torch.aten.view"(%26933, %26937) : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%26938, %18474) <{shape_expressions = #map23}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %26939 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %26940 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %26941 = "torch.aten.add.Scalar"(%26877, %26939, %26940) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%26941, %18474) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %26942 = "torch.prim.ListConstruct"(%19011) : (!torch.int) -> !torch.list<int>
    %26943 = "torch.aten.view"(%26941, %26942) : (!torch.vtensor<[4,?],si64>, !torch.list<int>) -> !torch.vtensor<[?],si64>
    "torch.bind_symbolic_shape"(%26943, %18474) <{shape_expressions = #map24}> : (!torch.vtensor<[?],si64>, !torch.int) -> ()
    %26944 = "torch.prim.ListConstruct"(%26943) : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>>
    %26945 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %26946 = "torch.aten.index_put"(%26927, %26944, %26938, %26945) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.bool) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%26946, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %26947 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %26948 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %26949 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %26950 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %26951 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %26952 = "torch.prim.ListConstruct"(%18479, %26947, %26948, %26949, %26950, %26951) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %26953 = "torch.aten.view"(%26946, %26952) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%26953, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %26954 = "torch.constant.int"() <{value = 2097152 : i64}> : () -> !torch.int
    %26955 = "torch.prim.ListConstruct"(%18479, %26954) : (!torch.int, !torch.int) -> !torch.list<int>
    %26956 = "torch.aten.view"(%26953, %26955) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,2097152],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%26956, %18475) <{shape_expressions = #map2}> : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> ()
    %26957 = "torch.constant.int"() <{value = -2 : i64}> : () -> !torch.int
    %26958 = "torch.aten.unsqueeze"(%26872, %26957) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%26958, %18474) <{shape_expressions = #map25}> : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.int) -> ()
    %26959 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %26960 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %26961 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %26962 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %26963 = "torch.prim.ListConstruct"(%26959, %18481, %26960, %26961, %26962) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %26964 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %26965 = "torch.aten.expand"(%26958, %26963, %26964) : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%26965, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %26966 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %26967 = "torch.aten.clone"(%26965, %26966) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%26967, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %26968 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %26969 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %26970 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %26971 = "torch.prim.ListConstruct"(%26968, %18481, %26969, %26970) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %26972 = "torch.aten._unsafe_view"(%26967, %26971) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%26972, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %26973 = "torch.constant.int"() <{value = -2 : i64}> : () -> !torch.int
    %26974 = "torch.aten.unsqueeze"(%26572, %26973) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%26974, %18474) <{shape_expressions = #map25}> : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.int) -> ()
    %26975 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %26976 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %26977 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %26978 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %26979 = "torch.prim.ListConstruct"(%26975, %18481, %26976, %26977, %26978) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %26980 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %26981 = "torch.aten.expand"(%26974, %26979, %26980) : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%26981, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %26982 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %26983 = "torch.aten.clone"(%26981, %26982) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%26983, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %26984 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %26985 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %26986 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %26987 = "torch.prim.ListConstruct"(%26984, %18481, %26985, %26986) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %26988 = "torch.aten._unsafe_view"(%26983, %26987) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%26988, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %26989 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %26990 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %26991 = "torch.aten.transpose.int"(%26722, %26989, %26990) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%26991, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %26992 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %26993 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %26994 = "torch.aten.transpose.int"(%26972, %26992, %26993) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%26994, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %26995 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %26996 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %26997 = "torch.aten.transpose.int"(%26988, %26995, %26996) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%26997, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %26998 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %26999 = "torch.aten.squeeze.dim"(%18570, %26998) : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,1,?,?],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%26999, %18474) <{shape_expressions = #map8}> : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> ()
    %27000 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %27001 = "torch.aten.squeeze.dim"(%26999, %27000) : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,1,?,?],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%27001, %18474) <{shape_expressions = #map8}> : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> ()
    %27002 = "torch_c.to_builtin_tensor"(%26991) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %27003 = "torch_c.to_builtin_tensor"(%26994) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %27004 = "torch_c.to_builtin_tensor"(%26997) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %27005 = "torch_c.to_builtin_tensor"(%27001) : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>) -> tensor<4x1x?x?xf8E4M3FNUZ>
    %27006 = "tensor.cast"(%27005) : (tensor<4x1x?x?xf8E4M3FNUZ>) -> tensor<?x?x?x?xf8E4M3FNUZ>
    %27007 = "torch_c.to_builtin_tensor"(%17689) : (!torch.vtensor<[],f32>) -> tensor<f32>
    %27008 = "util.call"(%27002, %27003, %27004, %27007, %27006) <{callee = @sharktank_masked_flash_attention_4_32_128_128_f8E4M3FNUZ_f32_f32}> : (tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<f32>, tensor<?x?x?x?xf8E4M3FNUZ>) -> tensor<4x32x?x128xf32>
    %27009 = "torch_c.from_builtin_tensor"(%27008) : (tensor<4x32x?x128xf32>) -> !torch.vtensor<[4,32,?,128],f32>
    "torch.bind_symbolic_shape"(%27009, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f32>, !torch.int) -> ()
    %27010 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %27011 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %27012 = "torch.aten.transpose.int"(%27009, %27010, %27011) : (!torch.vtensor<[4,32,?,128],f32>, !torch.int, !torch.int) -> !torch.vtensor<[4,?,32,128],f32>
    "torch.bind_symbolic_shape"(%27012, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f32>, !torch.int) -> ()
    %27013 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %27014 = "torch.aten.clone"(%27012, %27013) : (!torch.vtensor<[4,?,32,128],f32>, !torch.int) -> !torch.vtensor<[4,?,32,128],f32>
    "torch.bind_symbolic_shape"(%27014, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f32>, !torch.int) -> ()
    %27015 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %27016 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %27017 = "torch.prim.ListConstruct"(%27015, %18481, %27016) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %27018 = "torch.aten._unsafe_view"(%27014, %27017) : (!torch.vtensor<[4,?,32,128],f32>, !torch.list<int>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%27018, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %27019 = "torch.aten.div.Tensor"(%27018, %17691) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%27019, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %27020 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %27021 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %27022 = "torch.aten.clamp"(%27019, %27020, %27021) : (!torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%27022, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %27023 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %27024 = "torch.prims.convert_element_type"(%27022, %27023) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%27024, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %27025 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %27026 = "torch.aten.unsqueeze"(%17693, %27025) : (!torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>
    %27027 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %27028 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %27029 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %27030 = "torch.prim.ListConstruct"(%27027, %27028, %27029) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %27031 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %27032 = "torch.aten.expand"(%27026, %27030, %27031) : (!torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ>
    %27033 = "torch_c.to_builtin_tensor"(%27024) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %27034 = "torch_c.to_builtin_tensor"(%27032) : (!torch.vtensor<[4,4096,4096],f8E4M3FNUZ>) -> tensor<4x4096x4096xf8E4M3FNUZ>
    %27035 = "util.call"(%27033, %27034) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x?x4096xf32>
    %27036 = "torch_c.from_builtin_tensor"(%27035) : (tensor<4x?x4096xf32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%27036, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %27037 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %27038 = "torch.prims.convert_element_type"(%27036, %27037) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%27038, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %27039 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %27040 = "torch.aten.add.Tensor"(%26466, %27038, %27039) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%27040, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %27041 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %27042 = "torch.prims.convert_element_type"(%27040, %27041) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%27042, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %27043 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %27044 = "torch.aten.pow.Tensor_Scalar"(%27042, %27043) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%27044, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %27045 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %27046 = "torch.prim.ListConstruct"(%27045) : (!torch.int) -> !torch.list<int>
    %27047 = "torch.constant.bool"() <{value = true}> : () -> !torch.bool
    %27048 = "torch.constant.none"() : () -> !torch.none
    %27049 = "torch.aten.mean.dim"(%27044, %27046, %27047, %27048) : (!torch.vtensor<[4,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%27049, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %27050 = "torch.constant.float"() <{value = 1.000000e-05 : f64}> : () -> !torch.float
    %27051 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %27052 = "torch.aten.add.Scalar"(%27049, %27050, %27051) : (!torch.vtensor<[4,?,1],f32>, !torch.float, !torch.int) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%27052, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %27053 = "torch.aten.rsqrt"(%27052) : (!torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%27053, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %27054 = "torch.aten.mul.Tensor"(%27042, %27053) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%27054, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %27055 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %27056 = "torch.prims.convert_element_type"(%27054, %27055) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%27056, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %27057 = "torch.aten.mul.Tensor"(%17695, %27056) : (!torch.vtensor<[4096],bf16>, !torch.vtensor<[4,?,4096],bf16>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%27057, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %27058 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %27059 = "torch.prims.convert_element_type"(%27057, %27058) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%27059, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %27060 = "torch.aten.div.Tensor"(%27059, %17697) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%27060, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %27061 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %27062 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %27063 = "torch.aten.clamp"(%27060, %27061, %27062) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%27063, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %27064 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %27065 = "torch.prims.convert_element_type"(%27063, %27064) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%27065, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %27066 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %27067 = "torch.aten.unsqueeze"(%17699, %27066) : (!torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>
    %27068 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %27069 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %27070 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %27071 = "torch.prim.ListConstruct"(%27068, %27069, %27070) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %27072 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %27073 = "torch.aten.expand"(%27067, %27071, %27072) : (!torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ>
    %27074 = "torch_c.to_builtin_tensor"(%27065) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %27075 = "torch_c.to_builtin_tensor"(%27073) : (!torch.vtensor<[4,14336,4096],f8E4M3FNUZ>) -> tensor<4x14336x4096xf8E4M3FNUZ>
    %27076 = "util.call"(%27074, %27075) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x?x14336xf32>
    %27077 = "torch_c.from_builtin_tensor"(%27076) : (tensor<4x?x14336xf32>) -> !torch.vtensor<[4,?,14336],f32>
    "torch.bind_symbolic_shape"(%27077, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> ()
    %27078 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %27079 = "torch.prims.convert_element_type"(%27077, %27078) : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%27079, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %27080 = "torch.aten.silu"(%27079) : (!torch.vtensor<[4,?,14336],bf16>) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%27080, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %27081 = "torch.aten.div.Tensor"(%27059, %17701) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%27081, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %27082 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %27083 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %27084 = "torch.aten.clamp"(%27081, %27082, %27083) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%27084, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %27085 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %27086 = "torch.prims.convert_element_type"(%27084, %27085) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%27086, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %27087 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %27088 = "torch.aten.unsqueeze"(%17703, %27087) : (!torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>
    %27089 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %27090 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %27091 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %27092 = "torch.prim.ListConstruct"(%27089, %27090, %27091) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %27093 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %27094 = "torch.aten.expand"(%27088, %27092, %27093) : (!torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ>
    %27095 = "torch_c.to_builtin_tensor"(%27086) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %27096 = "torch_c.to_builtin_tensor"(%27094) : (!torch.vtensor<[4,14336,4096],f8E4M3FNUZ>) -> tensor<4x14336x4096xf8E4M3FNUZ>
    %27097 = "util.call"(%27095, %27096) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x?x14336xf32>
    %27098 = "torch_c.from_builtin_tensor"(%27097) : (tensor<4x?x14336xf32>) -> !torch.vtensor<[4,?,14336],f32>
    "torch.bind_symbolic_shape"(%27098, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> ()
    %27099 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %27100 = "torch.prims.convert_element_type"(%27098, %27099) : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%27100, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %27101 = "torch.aten.mul.Tensor"(%27080, %27100) : (!torch.vtensor<[4,?,14336],bf16>, !torch.vtensor<[4,?,14336],bf16>) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%27101, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %27102 = "torch.aten.div.Tensor"(%27101, %17705) : (!torch.vtensor<[4,?,14336],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%27102, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %27103 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %27104 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %27105 = "torch.aten.clamp"(%27102, %27103, %27104) : (!torch.vtensor<[4,?,14336],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%27105, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %27106 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %27107 = "torch.prims.convert_element_type"(%27105, %27106) : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> !torch.vtensor<[4,?,14336],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%27107, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],f8E4M3FNUZ>, !torch.int) -> ()
    %27108 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %27109 = "torch.aten.unsqueeze"(%17707, %27108) : (!torch.vtensor<[4096,14336],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,14336],f8E4M3FNUZ>
    %27110 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %27111 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %27112 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %27113 = "torch.prim.ListConstruct"(%27110, %27111, %27112) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %27114 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %27115 = "torch.aten.expand"(%27109, %27113, %27114) : (!torch.vtensor<[1,4096,14336],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,14336],f8E4M3FNUZ>
    %27116 = "torch_c.to_builtin_tensor"(%27107) : (!torch.vtensor<[4,?,14336],f8E4M3FNUZ>) -> tensor<4x?x14336xf8E4M3FNUZ>
    %27117 = "torch_c.to_builtin_tensor"(%27115) : (!torch.vtensor<[4,4096,14336],f8E4M3FNUZ>) -> tensor<4x4096x14336xf8E4M3FNUZ>
    %27118 = "util.call"(%27116, %27117) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx14336xf8E4M3FNUZ_R4x4096x14336xf8E4M3FNUZ}> : (tensor<4x?x14336xf8E4M3FNUZ>, tensor<4x4096x14336xf8E4M3FNUZ>) -> tensor<4x?x4096xf32>
    %27119 = "torch_c.from_builtin_tensor"(%27118) : (tensor<4x?x4096xf32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%27119, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %27120 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %27121 = "torch.prims.convert_element_type"(%27119, %27120) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%27121, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %27122 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %27123 = "torch.aten.add.Tensor"(%27040, %27121, %27122) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%27123, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %27124 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %27125 = "torch.prims.convert_element_type"(%27123, %27124) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%27125, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %27126 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %27127 = "torch.aten.pow.Tensor_Scalar"(%27125, %27126) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%27127, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %27128 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %27129 = "torch.prim.ListConstruct"(%27128) : (!torch.int) -> !torch.list<int>
    %27130 = "torch.constant.bool"() <{value = true}> : () -> !torch.bool
    %27131 = "torch.constant.none"() : () -> !torch.none
    %27132 = "torch.aten.mean.dim"(%27127, %27129, %27130, %27131) : (!torch.vtensor<[4,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%27132, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %27133 = "torch.constant.float"() <{value = 1.000000e-05 : f64}> : () -> !torch.float
    %27134 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %27135 = "torch.aten.add.Scalar"(%27132, %27133, %27134) : (!torch.vtensor<[4,?,1],f32>, !torch.float, !torch.int) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%27135, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %27136 = "torch.aten.rsqrt"(%27135) : (!torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%27136, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %27137 = "torch.aten.mul.Tensor"(%27125, %27136) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%27137, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %27138 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %27139 = "torch.prims.convert_element_type"(%27137, %27138) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%27139, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %27140 = "torch.aten.mul.Tensor"(%17709, %27139) : (!torch.vtensor<[4096],bf16>, !torch.vtensor<[4,?,4096],bf16>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%27140, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %27141 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %27142 = "torch.prims.convert_element_type"(%27140, %27141) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%27142, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %27143 = "torch.aten.div.Tensor"(%27142, %17711) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%27143, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %27144 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %27145 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %27146 = "torch.aten.clamp"(%27143, %27144, %27145) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%27146, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %27147 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %27148 = "torch.prims.convert_element_type"(%27146, %27147) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%27148, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %27149 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %27150 = "torch.aten.unsqueeze"(%17713, %27149) : (!torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>
    %27151 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %27152 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %27153 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %27154 = "torch.prim.ListConstruct"(%27151, %27152, %27153) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %27155 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %27156 = "torch.aten.expand"(%27150, %27154, %27155) : (!torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ>
    %27157 = "torch_c.to_builtin_tensor"(%27148) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %27158 = "torch_c.to_builtin_tensor"(%27156) : (!torch.vtensor<[4,4096,4096],f8E4M3FNUZ>) -> tensor<4x4096x4096xf8E4M3FNUZ>
    %27159 = "util.call"(%27157, %27158) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x?x4096xf32>
    %27160 = "torch_c.from_builtin_tensor"(%27159) : (tensor<4x?x4096xf32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%27160, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %27161 = "torch.aten.div.Tensor"(%27160, %17715) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%27161, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %27162 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %27163 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %27164 = "torch.aten.clamp"(%27161, %27162, %27163) : (!torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%27164, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %27165 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %27166 = "torch.prims.convert_element_type"(%27164, %27165) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%27166, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %27167 = "torch.aten.div.Tensor"(%27142, %17717) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%27167, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %27168 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %27169 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %27170 = "torch.aten.clamp"(%27167, %27168, %27169) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%27170, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %27171 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %27172 = "torch.prims.convert_element_type"(%27170, %27171) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%27172, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %27173 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %27174 = "torch.aten.unsqueeze"(%17719, %27173) : (!torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>
    %27175 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %27176 = "torch.constant.int"() <{value = 1024 : i64}> : () -> !torch.int
    %27177 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %27178 = "torch.prim.ListConstruct"(%27175, %27176, %27177) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %27179 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %27180 = "torch.aten.expand"(%27174, %27178, %27179) : (!torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ>
    %27181 = "torch_c.to_builtin_tensor"(%27172) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %27182 = "torch_c.to_builtin_tensor"(%27180) : (!torch.vtensor<[4,1024,4096],f8E4M3FNUZ>) -> tensor<4x1024x4096xf8E4M3FNUZ>
    %27183 = "util.call"(%27181, %27182) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x?x1024xf32>
    %27184 = "torch_c.from_builtin_tensor"(%27183) : (tensor<4x?x1024xf32>) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%27184, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %27185 = "torch.aten.div.Tensor"(%27184, %17721) : (!torch.vtensor<[4,?,1024],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%27185, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %27186 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %27187 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %27188 = "torch.aten.clamp"(%27185, %27186, %27187) : (!torch.vtensor<[4,?,1024],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%27188, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %27189 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %27190 = "torch.prims.convert_element_type"(%27188, %27189) : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> !torch.vtensor<[4,?,1024],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%27190, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.int) -> ()
    %27191 = "torch.aten.div.Tensor"(%27142, %17723) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%27191, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %27192 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %27193 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %27194 = "torch.aten.clamp"(%27191, %27192, %27193) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%27194, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %27195 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %27196 = "torch.prims.convert_element_type"(%27194, %27195) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%27196, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %27197 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %27198 = "torch.aten.unsqueeze"(%17725, %27197) : (!torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>
    %27199 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %27200 = "torch.constant.int"() <{value = 1024 : i64}> : () -> !torch.int
    %27201 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %27202 = "torch.prim.ListConstruct"(%27199, %27200, %27201) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %27203 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %27204 = "torch.aten.expand"(%27198, %27202, %27203) : (!torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ>
    %27205 = "torch_c.to_builtin_tensor"(%27196) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %27206 = "torch_c.to_builtin_tensor"(%27204) : (!torch.vtensor<[4,1024,4096],f8E4M3FNUZ>) -> tensor<4x1024x4096xf8E4M3FNUZ>
    %27207 = "util.call"(%27205, %27206) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x?x1024xf32>
    %27208 = "torch_c.from_builtin_tensor"(%27207) : (tensor<4x?x1024xf32>) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%27208, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %27209 = "torch.aten.div.Tensor"(%27208, %17727) : (!torch.vtensor<[4,?,1024],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%27209, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %27210 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %27211 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %27212 = "torch.aten.clamp"(%27209, %27210, %27211) : (!torch.vtensor<[4,?,1024],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%27212, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %27213 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %27214 = "torch.prims.convert_element_type"(%27212, %27213) : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> !torch.vtensor<[4,?,1024],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%27214, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.int) -> ()
    %27215 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %27216 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %27217 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %27218 = "torch.prim.ListConstruct"(%27215, %18481, %27216, %27217) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %27219 = "torch.aten.view"(%27166, %27218) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%27219, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %27220 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %27221 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %27222 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %27223 = "torch.prim.ListConstruct"(%27220, %18481, %27221, %27222) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %27224 = "torch.aten.view"(%27190, %27223) : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%27224, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %27225 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %27226 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %27227 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %27228 = "torch.prim.ListConstruct"(%27225, %18481, %27226, %27227) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %27229 = "torch.aten.view"(%27214, %27228) : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%27229, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %27230 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
    %27231 = "torch.constant.none"() : () -> !torch.none
    %27232 = "torch.constant.none"() : () -> !torch.none
    %27233 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %27234 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %27235 = "torch.aten.arange"(%27230, %27231, %27232, %27233, %27234) : (!torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[131072],si64>
    %27236 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %27237 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %27238 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %27239 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %27240 = "torch.constant.none"() : () -> !torch.none
    %27241 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %27242 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %27243 = "torch.aten.arange.start_step"(%27236, %27237, %27238, %27239, %27240, %27241, %27242) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[64],si64>
    %27244 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %27245 = "torch.prims.convert_element_type"(%27243, %27244) : (!torch.vtensor<[64],si64>, !torch.int) -> !torch.vtensor<[64],f32>
    %27246 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %27247 = "torch.aten.div.Scalar"(%27245, %27246) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %27248 = "torch.constant.float"() <{value = 5.000000e+05 : f64}> : () -> !torch.float
    %27249 = "torch.aten.pow.Scalar"(%27248, %27247) : (!torch.float, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %27250 = "torch.aten.reciprocal"(%27249) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %27251 = "torch.constant.float"() <{value = 1.000000e+00 : f64}> : () -> !torch.float
    %27252 = "torch.aten.mul.Scalar"(%27250, %27251) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
    %27253 = "torch.aten.reciprocal"(%27252) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %27254 = "torch.constant.float"() <{value = 6.2831853071795862 : f64}> : () -> !torch.float
    %27255 = "torch.aten.mul.Scalar"(%27253, %27254) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
    %27256 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
    %27257 = "torch.aten.gt.Scalar"(%27255, %27256) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %27258 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %27259 = "torch.aten.div.Scalar"(%27252, %27258) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %27260 = "torch.aten.where.self"(%27257, %27259, %27252) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %27261 = "torch.aten.reciprocal"(%27255) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %27262 = "torch.constant.int"() <{value = 8192 : i64}> : () -> !torch.int
    %27263 = "torch.aten.mul.Scalar"(%27261, %27262) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %27264 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %27265 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %27266 = "torch.aten.sub.Scalar"(%27263, %27264, %27265) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
    %27267 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %27268 = "torch.aten.div.Scalar"(%27266, %27267) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %27269 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %27270 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %27271 = "torch.aten.rsub.Scalar"(%27268, %27269, %27270) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
    %27272 = "torch.aten.mul.Tensor"(%27271, %27260) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %27273 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %27274 = "torch.aten.div.Scalar"(%27272, %27273) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %27275 = "torch.aten.mul.Tensor"(%27268, %27260) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %27276 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %27277 = "torch.aten.add.Tensor"(%27274, %27275, %27276) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %27278 = "torch.constant.float"() <{value = 2.048000e+03 : f64}> : () -> !torch.float
    %27279 = "torch.aten.lt.Scalar"(%27255, %27278) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %27280 = "torch.aten.bitwise_not"(%27279) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %27281 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
    %27282 = "torch.aten.gt.Scalar"(%27255, %27281) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %27283 = "torch.aten.bitwise_not"(%27282) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %27284 = "torch.aten.mul.Tensor"(%27280, %27283) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %27285 = "torch.aten.where.self"(%27284, %27277, %27260) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %27286 = "torch.prim.ListConstruct"(%27285, %27285) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.list<vtensor>
    %27287 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %27288 = "torch.aten.cat"(%27286, %27287) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[128],f32>
    %27289 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %27290 = "torch.prims.convert_element_type"(%27235, %27289) : (!torch.vtensor<[131072],si64>, !torch.int) -> !torch.vtensor<[131072],f32>
    %27291 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %27292 = "torch.prims.convert_element_type"(%27288, %27291) : (!torch.vtensor<[128],f32>, !torch.int) -> !torch.vtensor<[128],f32>
    %27293 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
    %27294 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %27295 = "torch.prim.ListConstruct"(%27293, %27294) : (!torch.int, !torch.int) -> !torch.list<int>
    %27296 = "torch.aten.view"(%27290, %27295) : (!torch.vtensor<[131072],f32>, !torch.list<int>) -> !torch.vtensor<[131072,1],f32>
    %27297 = "torch.aten.mul.Tensor"(%27296, %27292) : (!torch.vtensor<[131072,1],f32>, !torch.vtensor<[128],f32>) -> !torch.vtensor<[131072,128],f32>
    %27298 = "torch.aten.cos"(%27297) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
    %27299 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %27300 = "torch.prims.convert_element_type"(%27298, %27299) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
    %27301 = "torch.aten.sin"(%27297) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
    %27302 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %27303 = "torch.prims.convert_element_type"(%27301, %27302) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
    %27304 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %27305 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %27306 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %27307 = "torch.aten.slice.Tensor"(%27300, %27304, %27305, %18481, %27306) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%27307, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %27308 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %27309 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %27310 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %27311 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %27312 = "torch.aten.slice.Tensor"(%27307, %27308, %27309, %27310, %27311) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%27312, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %27313 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %27314 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %27315 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %27316 = "torch.aten.slice.Tensor"(%27303, %27313, %27314, %18481, %27315) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%27316, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %27317 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %27318 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %27319 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %27320 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %27321 = "torch.aten.slice.Tensor"(%27316, %27317, %27318, %27319, %27320) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%27321, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %27322 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %27323 = "torch.aten.unsqueeze"(%27312, %27322) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%27323, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %27324 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %27325 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %27326 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %27327 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %27328 = "torch.aten.slice.Tensor"(%27323, %27324, %27325, %27326, %27327) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%27328, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %27329 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %27330 = "torch.aten.unsqueeze"(%27328, %27329) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%27330, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %27331 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %27332 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %27333 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %27334 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %27335 = "torch.aten.slice.Tensor"(%27330, %27331, %27332, %27333, %27334) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%27335, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %27336 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %27337 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %27338 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %27339 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %27340 = "torch.prim.ListConstruct"(%27336, %27337, %27338, %27339) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %27341 = "torch.aten.repeat"(%27335, %27340) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%27341, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
    %27342 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %27343 = "torch.aten.unsqueeze"(%27321, %27342) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%27343, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %27344 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %27345 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %27346 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %27347 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %27348 = "torch.aten.slice.Tensor"(%27343, %27344, %27345, %27346, %27347) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%27348, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %27349 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %27350 = "torch.aten.unsqueeze"(%27348, %27349) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%27350, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %27351 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %27352 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %27353 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %27354 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %27355 = "torch.aten.slice.Tensor"(%27350, %27351, %27352, %27353, %27354) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%27355, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %27356 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %27357 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %27358 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %27359 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %27360 = "torch.prim.ListConstruct"(%27356, %27357, %27358, %27359) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %27361 = "torch.aten.repeat"(%27355, %27360) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%27361, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
    %27362 = "torch.aten.mul.Tensor"(%27219, %27341) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%27362, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %27363 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %27364 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %27365 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %27366 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %27367 = "torch.aten.slice.Tensor"(%27219, %27363, %27364, %27365, %27366) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%27367, %18474) <{shape_expressions = #map18}> : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.int) -> ()
    %27368 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %27369 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %27370 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %27371 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %27372 = "torch.aten.slice.Tensor"(%27219, %27368, %27369, %27370, %27371) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%27372, %18474) <{shape_expressions = #map18}> : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.int) -> ()
    %27373 = "torch.aten.neg"(%27372) : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>) -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%27373, %18474) <{shape_expressions = #map18}> : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.int) -> ()
    %27374 = "torch.prim.ListConstruct"(%27373, %27367) : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>) -> !torch.list<vtensor>
    %27375 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %27376 = "torch.aten.cat"(%27374, %27375) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%27376, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %27377 = "torch.aten.mul.Tensor"(%27376, %27361) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%27377, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %27378 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %27379 = "torch.aten.add.Tensor"(%27362, %27377, %27378) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%27379, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %27380 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
    %27381 = "torch.constant.none"() : () -> !torch.none
    %27382 = "torch.constant.none"() : () -> !torch.none
    %27383 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %27384 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %27385 = "torch.aten.arange"(%27380, %27381, %27382, %27383, %27384) : (!torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[131072],si64>
    %27386 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %27387 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %27388 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %27389 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %27390 = "torch.constant.none"() : () -> !torch.none
    %27391 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %27392 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %27393 = "torch.aten.arange.start_step"(%27386, %27387, %27388, %27389, %27390, %27391, %27392) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[64],si64>
    %27394 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %27395 = "torch.prims.convert_element_type"(%27393, %27394) : (!torch.vtensor<[64],si64>, !torch.int) -> !torch.vtensor<[64],f32>
    %27396 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %27397 = "torch.aten.div.Scalar"(%27395, %27396) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %27398 = "torch.constant.float"() <{value = 5.000000e+05 : f64}> : () -> !torch.float
    %27399 = "torch.aten.pow.Scalar"(%27398, %27397) : (!torch.float, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %27400 = "torch.aten.reciprocal"(%27399) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %27401 = "torch.constant.float"() <{value = 1.000000e+00 : f64}> : () -> !torch.float
    %27402 = "torch.aten.mul.Scalar"(%27400, %27401) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
    %27403 = "torch.aten.reciprocal"(%27402) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %27404 = "torch.constant.float"() <{value = 6.2831853071795862 : f64}> : () -> !torch.float
    %27405 = "torch.aten.mul.Scalar"(%27403, %27404) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
    %27406 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
    %27407 = "torch.aten.gt.Scalar"(%27405, %27406) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %27408 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %27409 = "torch.aten.div.Scalar"(%27402, %27408) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %27410 = "torch.aten.where.self"(%27407, %27409, %27402) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %27411 = "torch.aten.reciprocal"(%27405) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %27412 = "torch.constant.int"() <{value = 8192 : i64}> : () -> !torch.int
    %27413 = "torch.aten.mul.Scalar"(%27411, %27412) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %27414 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %27415 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %27416 = "torch.aten.sub.Scalar"(%27413, %27414, %27415) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
    %27417 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %27418 = "torch.aten.div.Scalar"(%27416, %27417) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %27419 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %27420 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %27421 = "torch.aten.rsub.Scalar"(%27418, %27419, %27420) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
    %27422 = "torch.aten.mul.Tensor"(%27421, %27410) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %27423 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %27424 = "torch.aten.div.Scalar"(%27422, %27423) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %27425 = "torch.aten.mul.Tensor"(%27418, %27410) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %27426 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %27427 = "torch.aten.add.Tensor"(%27424, %27425, %27426) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %27428 = "torch.constant.float"() <{value = 2.048000e+03 : f64}> : () -> !torch.float
    %27429 = "torch.aten.lt.Scalar"(%27405, %27428) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %27430 = "torch.aten.bitwise_not"(%27429) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %27431 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
    %27432 = "torch.aten.gt.Scalar"(%27405, %27431) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %27433 = "torch.aten.bitwise_not"(%27432) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %27434 = "torch.aten.mul.Tensor"(%27430, %27433) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %27435 = "torch.aten.where.self"(%27434, %27427, %27410) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %27436 = "torch.prim.ListConstruct"(%27435, %27435) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.list<vtensor>
    %27437 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %27438 = "torch.aten.cat"(%27436, %27437) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[128],f32>
    %27439 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %27440 = "torch.prims.convert_element_type"(%27385, %27439) : (!torch.vtensor<[131072],si64>, !torch.int) -> !torch.vtensor<[131072],f32>
    %27441 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %27442 = "torch.prims.convert_element_type"(%27438, %27441) : (!torch.vtensor<[128],f32>, !torch.int) -> !torch.vtensor<[128],f32>
    %27443 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
    %27444 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %27445 = "torch.prim.ListConstruct"(%27443, %27444) : (!torch.int, !torch.int) -> !torch.list<int>
    %27446 = "torch.aten.view"(%27440, %27445) : (!torch.vtensor<[131072],f32>, !torch.list<int>) -> !torch.vtensor<[131072,1],f32>
    %27447 = "torch.aten.mul.Tensor"(%27446, %27442) : (!torch.vtensor<[131072,1],f32>, !torch.vtensor<[128],f32>) -> !torch.vtensor<[131072,128],f32>
    %27448 = "torch.aten.cos"(%27447) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
    %27449 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %27450 = "torch.prims.convert_element_type"(%27448, %27449) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
    %27451 = "torch.aten.sin"(%27447) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
    %27452 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %27453 = "torch.prims.convert_element_type"(%27451, %27452) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
    %27454 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %27455 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %27456 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %27457 = "torch.aten.slice.Tensor"(%27450, %27454, %27455, %18481, %27456) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%27457, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %27458 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %27459 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %27460 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %27461 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %27462 = "torch.aten.slice.Tensor"(%27457, %27458, %27459, %27460, %27461) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%27462, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %27463 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %27464 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %27465 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %27466 = "torch.aten.slice.Tensor"(%27453, %27463, %27464, %18481, %27465) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%27466, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %27467 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %27468 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %27469 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %27470 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %27471 = "torch.aten.slice.Tensor"(%27466, %27467, %27468, %27469, %27470) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%27471, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %27472 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %27473 = "torch.aten.unsqueeze"(%27462, %27472) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%27473, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %27474 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %27475 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %27476 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %27477 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %27478 = "torch.aten.slice.Tensor"(%27473, %27474, %27475, %27476, %27477) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%27478, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %27479 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %27480 = "torch.aten.unsqueeze"(%27478, %27479) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%27480, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %27481 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %27482 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %27483 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %27484 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %27485 = "torch.aten.slice.Tensor"(%27480, %27481, %27482, %27483, %27484) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%27485, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %27486 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %27487 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %27488 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %27489 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %27490 = "torch.prim.ListConstruct"(%27486, %27487, %27488, %27489) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %27491 = "torch.aten.repeat"(%27485, %27490) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%27491, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
    %27492 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %27493 = "torch.aten.unsqueeze"(%27471, %27492) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%27493, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %27494 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %27495 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %27496 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %27497 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %27498 = "torch.aten.slice.Tensor"(%27493, %27494, %27495, %27496, %27497) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%27498, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %27499 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %27500 = "torch.aten.unsqueeze"(%27498, %27499) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%27500, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %27501 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %27502 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %27503 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %27504 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %27505 = "torch.aten.slice.Tensor"(%27500, %27501, %27502, %27503, %27504) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%27505, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %27506 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %27507 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %27508 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %27509 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %27510 = "torch.prim.ListConstruct"(%27506, %27507, %27508, %27509) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %27511 = "torch.aten.repeat"(%27505, %27510) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%27511, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
    %27512 = "torch.aten.mul.Tensor"(%27224, %27491) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%27512, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %27513 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %27514 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %27515 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %27516 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %27517 = "torch.aten.slice.Tensor"(%27224, %27513, %27514, %27515, %27516) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%27517, %18474) <{shape_expressions = #map19}> : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.int) -> ()
    %27518 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %27519 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %27520 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %27521 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %27522 = "torch.aten.slice.Tensor"(%27224, %27518, %27519, %27520, %27521) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%27522, %18474) <{shape_expressions = #map19}> : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.int) -> ()
    %27523 = "torch.aten.neg"(%27522) : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>) -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%27523, %18474) <{shape_expressions = #map19}> : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.int) -> ()
    %27524 = "torch.prim.ListConstruct"(%27523, %27517) : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>) -> !torch.list<vtensor>
    %27525 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %27526 = "torch.aten.cat"(%27524, %27525) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%27526, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %27527 = "torch.aten.mul.Tensor"(%27526, %27511) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%27527, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %27528 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %27529 = "torch.aten.add.Tensor"(%27512, %27527, %27528) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%27529, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %27530 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %27531 = "torch.aten.mul.Scalar"(%arg69, %27530) : (!torch.vtensor<[4,?],si64>, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%27531, %18474) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %27532 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %27533 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %27534 = "torch.aten.add.Scalar"(%27531, %27532, %27533) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%27534, %18474) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %27535 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %27536 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %27537 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %27538 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %27539 = "torch.prim.ListConstruct"(%27535, %18477, %27536, %27537, %27538) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %27540 = "torch.aten.view"(%27529, %27539) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%27540, %18474) <{shape_expressions = #map22}> : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %27541 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %27542 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %27543 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %27544 = "torch.prim.ListConstruct"(%19011, %27541, %27542, %27543) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %27545 = "torch.aten.view"(%27540, %27544) : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%27545, %18474) <{shape_expressions = #map23}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %27546 = "torch.prim.ListConstruct"(%19011) : (!torch.int) -> !torch.list<int>
    %27547 = "torch.aten.view"(%27534, %27546) : (!torch.vtensor<[4,?],si64>, !torch.list<int>) -> !torch.vtensor<[?],si64>
    "torch.bind_symbolic_shape"(%27547, %18474) <{shape_expressions = #map24}> : (!torch.vtensor<[?],si64>, !torch.int) -> ()
    %27548 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %27549 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %27550 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %27551 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %27552 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %27553 = "torch.prim.ListConstruct"(%18479, %27548, %27549, %27550, %27551, %27552) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %27554 = "torch.aten.view"(%26956, %27553) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%27554, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %27555 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %27556 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %27557 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %27558 = "torch.prim.ListConstruct"(%18993, %27555, %27556, %27557) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %27559 = "torch.aten.view"(%27554, %27558) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%27559, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %27560 = "torch.prim.ListConstruct"(%27547) : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>>
    %27561 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %27562 = "torch.aten.index_put"(%27559, %27560, %27545, %27561) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.bool) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%27562, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %27563 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %27564 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %27565 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %27566 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %27567 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %27568 = "torch.prim.ListConstruct"(%18479, %27563, %27564, %27565, %27566, %27567) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %27569 = "torch.aten.view"(%27562, %27568) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%27569, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %27570 = "torch.constant.int"() <{value = 2097152 : i64}> : () -> !torch.int
    %27571 = "torch.prim.ListConstruct"(%18479, %27570) : (!torch.int, !torch.int) -> !torch.list<int>
    %27572 = "torch.aten.view"(%27569, %27571) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,2097152],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%27572, %18475) <{shape_expressions = #map2}> : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> ()
    %27573 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %27574 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %27575 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %27576 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %27577 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %27578 = "torch.prim.ListConstruct"(%18479, %27573, %27574, %27575, %27576, %27577) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %27579 = "torch.aten.view"(%27572, %27578) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%27579, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %27580 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %27581 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %27582 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %27583 = "torch.prim.ListConstruct"(%18993, %27580, %27581, %27582) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %27584 = "torch.aten.view"(%27579, %27583) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%27584, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %27585 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %27586 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %27587 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %27588 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %27589 = "torch.prim.ListConstruct"(%27585, %18477, %27586, %27587, %27588) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %27590 = "torch.aten.view"(%27229, %27589) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%27590, %18474) <{shape_expressions = #map22}> : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %27591 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %27592 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %27593 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %27594 = "torch.prim.ListConstruct"(%19011, %27591, %27592, %27593) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %27595 = "torch.aten.view"(%27590, %27594) : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%27595, %18474) <{shape_expressions = #map23}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %27596 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %27597 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %27598 = "torch.aten.add.Scalar"(%27534, %27596, %27597) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%27598, %18474) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %27599 = "torch.prim.ListConstruct"(%19011) : (!torch.int) -> !torch.list<int>
    %27600 = "torch.aten.view"(%27598, %27599) : (!torch.vtensor<[4,?],si64>, !torch.list<int>) -> !torch.vtensor<[?],si64>
    "torch.bind_symbolic_shape"(%27600, %18474) <{shape_expressions = #map24}> : (!torch.vtensor<[?],si64>, !torch.int) -> ()
    %27601 = "torch.prim.ListConstruct"(%27600) : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>>
    %27602 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %27603 = "torch.aten.index_put"(%27584, %27601, %27595, %27602) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.bool) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%27603, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %27604 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %27605 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %27606 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %27607 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %27608 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %27609 = "torch.prim.ListConstruct"(%18479, %27604, %27605, %27606, %27607, %27608) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %27610 = "torch.aten.view"(%27603, %27609) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%27610, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %27611 = "torch.constant.int"() <{value = 2097152 : i64}> : () -> !torch.int
    %27612 = "torch.prim.ListConstruct"(%18479, %27611) : (!torch.int, !torch.int) -> !torch.list<int>
    %27613 = "torch.aten.view"(%27610, %27612) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,2097152],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%27613, %18475) <{shape_expressions = #map2}> : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> ()
    %27614 = "torch.constant.int"() <{value = -2 : i64}> : () -> !torch.int
    %27615 = "torch.aten.unsqueeze"(%27529, %27614) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%27615, %18474) <{shape_expressions = #map25}> : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.int) -> ()
    %27616 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %27617 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %27618 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %27619 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %27620 = "torch.prim.ListConstruct"(%27616, %18481, %27617, %27618, %27619) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %27621 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %27622 = "torch.aten.expand"(%27615, %27620, %27621) : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%27622, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %27623 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %27624 = "torch.aten.clone"(%27622, %27623) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%27624, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %27625 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %27626 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %27627 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %27628 = "torch.prim.ListConstruct"(%27625, %18481, %27626, %27627) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %27629 = "torch.aten._unsafe_view"(%27624, %27628) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%27629, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %27630 = "torch.constant.int"() <{value = -2 : i64}> : () -> !torch.int
    %27631 = "torch.aten.unsqueeze"(%27229, %27630) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%27631, %18474) <{shape_expressions = #map25}> : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.int) -> ()
    %27632 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %27633 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %27634 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %27635 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %27636 = "torch.prim.ListConstruct"(%27632, %18481, %27633, %27634, %27635) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %27637 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %27638 = "torch.aten.expand"(%27631, %27636, %27637) : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%27638, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %27639 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %27640 = "torch.aten.clone"(%27638, %27639) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%27640, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %27641 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %27642 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %27643 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %27644 = "torch.prim.ListConstruct"(%27641, %18481, %27642, %27643) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %27645 = "torch.aten._unsafe_view"(%27640, %27644) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%27645, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %27646 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %27647 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %27648 = "torch.aten.transpose.int"(%27379, %27646, %27647) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%27648, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %27649 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %27650 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %27651 = "torch.aten.transpose.int"(%27629, %27649, %27650) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%27651, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %27652 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %27653 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %27654 = "torch.aten.transpose.int"(%27645, %27652, %27653) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%27654, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %27655 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %27656 = "torch.aten.squeeze.dim"(%18570, %27655) : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,1,?,?],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%27656, %18474) <{shape_expressions = #map8}> : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> ()
    %27657 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %27658 = "torch.aten.squeeze.dim"(%27656, %27657) : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,1,?,?],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%27658, %18474) <{shape_expressions = #map8}> : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> ()
    %27659 = "torch_c.to_builtin_tensor"(%27648) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %27660 = "torch_c.to_builtin_tensor"(%27651) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %27661 = "torch_c.to_builtin_tensor"(%27654) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %27662 = "torch_c.to_builtin_tensor"(%27658) : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>) -> tensor<4x1x?x?xf8E4M3FNUZ>
    %27663 = "tensor.cast"(%27662) : (tensor<4x1x?x?xf8E4M3FNUZ>) -> tensor<?x?x?x?xf8E4M3FNUZ>
    %27664 = "torch_c.to_builtin_tensor"(%17729) : (!torch.vtensor<[],f32>) -> tensor<f32>
    %27665 = "util.call"(%27659, %27660, %27661, %27664, %27663) <{callee = @sharktank_masked_flash_attention_4_32_128_128_f8E4M3FNUZ_f32_f32}> : (tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<f32>, tensor<?x?x?x?xf8E4M3FNUZ>) -> tensor<4x32x?x128xf32>
    %27666 = "torch_c.from_builtin_tensor"(%27665) : (tensor<4x32x?x128xf32>) -> !torch.vtensor<[4,32,?,128],f32>
    "torch.bind_symbolic_shape"(%27666, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f32>, !torch.int) -> ()
    %27667 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %27668 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %27669 = "torch.aten.transpose.int"(%27666, %27667, %27668) : (!torch.vtensor<[4,32,?,128],f32>, !torch.int, !torch.int) -> !torch.vtensor<[4,?,32,128],f32>
    "torch.bind_symbolic_shape"(%27669, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f32>, !torch.int) -> ()
    %27670 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %27671 = "torch.aten.clone"(%27669, %27670) : (!torch.vtensor<[4,?,32,128],f32>, !torch.int) -> !torch.vtensor<[4,?,32,128],f32>
    "torch.bind_symbolic_shape"(%27671, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f32>, !torch.int) -> ()
    %27672 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %27673 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %27674 = "torch.prim.ListConstruct"(%27672, %18481, %27673) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %27675 = "torch.aten._unsafe_view"(%27671, %27674) : (!torch.vtensor<[4,?,32,128],f32>, !torch.list<int>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%27675, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %27676 = "torch.aten.div.Tensor"(%27675, %17731) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%27676, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %27677 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %27678 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %27679 = "torch.aten.clamp"(%27676, %27677, %27678) : (!torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%27679, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %27680 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %27681 = "torch.prims.convert_element_type"(%27679, %27680) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%27681, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %27682 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %27683 = "torch.aten.unsqueeze"(%17733, %27682) : (!torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>
    %27684 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %27685 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %27686 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %27687 = "torch.prim.ListConstruct"(%27684, %27685, %27686) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %27688 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %27689 = "torch.aten.expand"(%27683, %27687, %27688) : (!torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ>
    %27690 = "torch_c.to_builtin_tensor"(%27681) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %27691 = "torch_c.to_builtin_tensor"(%27689) : (!torch.vtensor<[4,4096,4096],f8E4M3FNUZ>) -> tensor<4x4096x4096xf8E4M3FNUZ>
    %27692 = "util.call"(%27690, %27691) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x?x4096xf32>
    %27693 = "torch_c.from_builtin_tensor"(%27692) : (tensor<4x?x4096xf32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%27693, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %27694 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %27695 = "torch.prims.convert_element_type"(%27693, %27694) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%27695, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %27696 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %27697 = "torch.aten.add.Tensor"(%27123, %27695, %27696) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%27697, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %27698 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %27699 = "torch.prims.convert_element_type"(%27697, %27698) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%27699, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %27700 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %27701 = "torch.aten.pow.Tensor_Scalar"(%27699, %27700) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%27701, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %27702 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %27703 = "torch.prim.ListConstruct"(%27702) : (!torch.int) -> !torch.list<int>
    %27704 = "torch.constant.bool"() <{value = true}> : () -> !torch.bool
    %27705 = "torch.constant.none"() : () -> !torch.none
    %27706 = "torch.aten.mean.dim"(%27701, %27703, %27704, %27705) : (!torch.vtensor<[4,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%27706, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %27707 = "torch.constant.float"() <{value = 1.000000e-05 : f64}> : () -> !torch.float
    %27708 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %27709 = "torch.aten.add.Scalar"(%27706, %27707, %27708) : (!torch.vtensor<[4,?,1],f32>, !torch.float, !torch.int) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%27709, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %27710 = "torch.aten.rsqrt"(%27709) : (!torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%27710, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %27711 = "torch.aten.mul.Tensor"(%27699, %27710) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%27711, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %27712 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %27713 = "torch.prims.convert_element_type"(%27711, %27712) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%27713, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %27714 = "torch.aten.mul.Tensor"(%17735, %27713) : (!torch.vtensor<[4096],bf16>, !torch.vtensor<[4,?,4096],bf16>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%27714, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %27715 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %27716 = "torch.prims.convert_element_type"(%27714, %27715) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%27716, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %27717 = "torch.aten.div.Tensor"(%27716, %17737) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%27717, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %27718 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %27719 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %27720 = "torch.aten.clamp"(%27717, %27718, %27719) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%27720, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %27721 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %27722 = "torch.prims.convert_element_type"(%27720, %27721) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%27722, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %27723 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %27724 = "torch.aten.unsqueeze"(%17739, %27723) : (!torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>
    %27725 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %27726 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %27727 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %27728 = "torch.prim.ListConstruct"(%27725, %27726, %27727) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %27729 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %27730 = "torch.aten.expand"(%27724, %27728, %27729) : (!torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ>
    %27731 = "torch_c.to_builtin_tensor"(%27722) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %27732 = "torch_c.to_builtin_tensor"(%27730) : (!torch.vtensor<[4,14336,4096],f8E4M3FNUZ>) -> tensor<4x14336x4096xf8E4M3FNUZ>
    %27733 = "util.call"(%27731, %27732) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x?x14336xf32>
    %27734 = "torch_c.from_builtin_tensor"(%27733) : (tensor<4x?x14336xf32>) -> !torch.vtensor<[4,?,14336],f32>
    "torch.bind_symbolic_shape"(%27734, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> ()
    %27735 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %27736 = "torch.prims.convert_element_type"(%27734, %27735) : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%27736, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %27737 = "torch.aten.silu"(%27736) : (!torch.vtensor<[4,?,14336],bf16>) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%27737, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %27738 = "torch.aten.div.Tensor"(%27716, %17741) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%27738, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %27739 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %27740 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %27741 = "torch.aten.clamp"(%27738, %27739, %27740) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%27741, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %27742 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %27743 = "torch.prims.convert_element_type"(%27741, %27742) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%27743, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %27744 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %27745 = "torch.aten.unsqueeze"(%17743, %27744) : (!torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>
    %27746 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %27747 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %27748 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %27749 = "torch.prim.ListConstruct"(%27746, %27747, %27748) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %27750 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %27751 = "torch.aten.expand"(%27745, %27749, %27750) : (!torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ>
    %27752 = "torch_c.to_builtin_tensor"(%27743) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %27753 = "torch_c.to_builtin_tensor"(%27751) : (!torch.vtensor<[4,14336,4096],f8E4M3FNUZ>) -> tensor<4x14336x4096xf8E4M3FNUZ>
    %27754 = "util.call"(%27752, %27753) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x?x14336xf32>
    %27755 = "torch_c.from_builtin_tensor"(%27754) : (tensor<4x?x14336xf32>) -> !torch.vtensor<[4,?,14336],f32>
    "torch.bind_symbolic_shape"(%27755, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> ()
    %27756 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %27757 = "torch.prims.convert_element_type"(%27755, %27756) : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%27757, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %27758 = "torch.aten.mul.Tensor"(%27737, %27757) : (!torch.vtensor<[4,?,14336],bf16>, !torch.vtensor<[4,?,14336],bf16>) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%27758, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %27759 = "torch.aten.div.Tensor"(%27758, %17745) : (!torch.vtensor<[4,?,14336],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%27759, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %27760 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %27761 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %27762 = "torch.aten.clamp"(%27759, %27760, %27761) : (!torch.vtensor<[4,?,14336],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%27762, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %27763 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %27764 = "torch.prims.convert_element_type"(%27762, %27763) : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> !torch.vtensor<[4,?,14336],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%27764, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],f8E4M3FNUZ>, !torch.int) -> ()
    %27765 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %27766 = "torch.aten.unsqueeze"(%17747, %27765) : (!torch.vtensor<[4096,14336],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,14336],f8E4M3FNUZ>
    %27767 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %27768 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %27769 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %27770 = "torch.prim.ListConstruct"(%27767, %27768, %27769) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %27771 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %27772 = "torch.aten.expand"(%27766, %27770, %27771) : (!torch.vtensor<[1,4096,14336],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,14336],f8E4M3FNUZ>
    %27773 = "torch_c.to_builtin_tensor"(%27764) : (!torch.vtensor<[4,?,14336],f8E4M3FNUZ>) -> tensor<4x?x14336xf8E4M3FNUZ>
    %27774 = "torch_c.to_builtin_tensor"(%27772) : (!torch.vtensor<[4,4096,14336],f8E4M3FNUZ>) -> tensor<4x4096x14336xf8E4M3FNUZ>
    %27775 = "util.call"(%27773, %27774) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx14336xf8E4M3FNUZ_R4x4096x14336xf8E4M3FNUZ}> : (tensor<4x?x14336xf8E4M3FNUZ>, tensor<4x4096x14336xf8E4M3FNUZ>) -> tensor<4x?x4096xf32>
    %27776 = "torch_c.from_builtin_tensor"(%27775) : (tensor<4x?x4096xf32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%27776, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %27777 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %27778 = "torch.prims.convert_element_type"(%27776, %27777) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%27778, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %27779 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %27780 = "torch.aten.add.Tensor"(%27697, %27778, %27779) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%27780, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %27781 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %27782 = "torch.prims.convert_element_type"(%27780, %27781) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%27782, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %27783 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %27784 = "torch.aten.pow.Tensor_Scalar"(%27782, %27783) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%27784, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %27785 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %27786 = "torch.prim.ListConstruct"(%27785) : (!torch.int) -> !torch.list<int>
    %27787 = "torch.constant.bool"() <{value = true}> : () -> !torch.bool
    %27788 = "torch.constant.none"() : () -> !torch.none
    %27789 = "torch.aten.mean.dim"(%27784, %27786, %27787, %27788) : (!torch.vtensor<[4,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%27789, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %27790 = "torch.constant.float"() <{value = 1.000000e-05 : f64}> : () -> !torch.float
    %27791 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %27792 = "torch.aten.add.Scalar"(%27789, %27790, %27791) : (!torch.vtensor<[4,?,1],f32>, !torch.float, !torch.int) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%27792, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %27793 = "torch.aten.rsqrt"(%27792) : (!torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%27793, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %27794 = "torch.aten.mul.Tensor"(%27782, %27793) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%27794, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %27795 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %27796 = "torch.prims.convert_element_type"(%27794, %27795) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%27796, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %27797 = "torch.aten.mul.Tensor"(%17749, %27796) : (!torch.vtensor<[4096],bf16>, !torch.vtensor<[4,?,4096],bf16>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%27797, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %27798 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %27799 = "torch.prims.convert_element_type"(%27797, %27798) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%27799, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %27800 = "torch.aten.div.Tensor"(%27799, %17751) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%27800, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %27801 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %27802 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %27803 = "torch.aten.clamp"(%27800, %27801, %27802) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%27803, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %27804 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %27805 = "torch.prims.convert_element_type"(%27803, %27804) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%27805, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %27806 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %27807 = "torch.aten.unsqueeze"(%17753, %27806) : (!torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>
    %27808 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %27809 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %27810 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %27811 = "torch.prim.ListConstruct"(%27808, %27809, %27810) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %27812 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %27813 = "torch.aten.expand"(%27807, %27811, %27812) : (!torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ>
    %27814 = "torch_c.to_builtin_tensor"(%27805) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %27815 = "torch_c.to_builtin_tensor"(%27813) : (!torch.vtensor<[4,4096,4096],f8E4M3FNUZ>) -> tensor<4x4096x4096xf8E4M3FNUZ>
    %27816 = "util.call"(%27814, %27815) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x?x4096xf32>
    %27817 = "torch_c.from_builtin_tensor"(%27816) : (tensor<4x?x4096xf32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%27817, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %27818 = "torch.aten.div.Tensor"(%27817, %17755) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%27818, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %27819 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %27820 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %27821 = "torch.aten.clamp"(%27818, %27819, %27820) : (!torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%27821, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %27822 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %27823 = "torch.prims.convert_element_type"(%27821, %27822) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%27823, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %27824 = "torch.aten.div.Tensor"(%27799, %17757) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%27824, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %27825 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %27826 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %27827 = "torch.aten.clamp"(%27824, %27825, %27826) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%27827, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %27828 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %27829 = "torch.prims.convert_element_type"(%27827, %27828) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%27829, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %27830 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %27831 = "torch.aten.unsqueeze"(%17759, %27830) : (!torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>
    %27832 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %27833 = "torch.constant.int"() <{value = 1024 : i64}> : () -> !torch.int
    %27834 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %27835 = "torch.prim.ListConstruct"(%27832, %27833, %27834) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %27836 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %27837 = "torch.aten.expand"(%27831, %27835, %27836) : (!torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ>
    %27838 = "torch_c.to_builtin_tensor"(%27829) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %27839 = "torch_c.to_builtin_tensor"(%27837) : (!torch.vtensor<[4,1024,4096],f8E4M3FNUZ>) -> tensor<4x1024x4096xf8E4M3FNUZ>
    %27840 = "util.call"(%27838, %27839) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x?x1024xf32>
    %27841 = "torch_c.from_builtin_tensor"(%27840) : (tensor<4x?x1024xf32>) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%27841, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %27842 = "torch.aten.div.Tensor"(%27841, %17761) : (!torch.vtensor<[4,?,1024],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%27842, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %27843 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %27844 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %27845 = "torch.aten.clamp"(%27842, %27843, %27844) : (!torch.vtensor<[4,?,1024],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%27845, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %27846 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %27847 = "torch.prims.convert_element_type"(%27845, %27846) : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> !torch.vtensor<[4,?,1024],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%27847, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.int) -> ()
    %27848 = "torch.aten.div.Tensor"(%27799, %17763) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%27848, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %27849 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %27850 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %27851 = "torch.aten.clamp"(%27848, %27849, %27850) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%27851, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %27852 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %27853 = "torch.prims.convert_element_type"(%27851, %27852) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%27853, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %27854 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %27855 = "torch.aten.unsqueeze"(%17765, %27854) : (!torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>
    %27856 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %27857 = "torch.constant.int"() <{value = 1024 : i64}> : () -> !torch.int
    %27858 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %27859 = "torch.prim.ListConstruct"(%27856, %27857, %27858) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %27860 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %27861 = "torch.aten.expand"(%27855, %27859, %27860) : (!torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ>
    %27862 = "torch_c.to_builtin_tensor"(%27853) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %27863 = "torch_c.to_builtin_tensor"(%27861) : (!torch.vtensor<[4,1024,4096],f8E4M3FNUZ>) -> tensor<4x1024x4096xf8E4M3FNUZ>
    %27864 = "util.call"(%27862, %27863) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x?x1024xf32>
    %27865 = "torch_c.from_builtin_tensor"(%27864) : (tensor<4x?x1024xf32>) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%27865, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %27866 = "torch.aten.div.Tensor"(%27865, %17767) : (!torch.vtensor<[4,?,1024],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%27866, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %27867 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %27868 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %27869 = "torch.aten.clamp"(%27866, %27867, %27868) : (!torch.vtensor<[4,?,1024],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%27869, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %27870 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %27871 = "torch.prims.convert_element_type"(%27869, %27870) : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> !torch.vtensor<[4,?,1024],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%27871, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.int) -> ()
    %27872 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %27873 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %27874 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %27875 = "torch.prim.ListConstruct"(%27872, %18481, %27873, %27874) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %27876 = "torch.aten.view"(%27823, %27875) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%27876, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %27877 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %27878 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %27879 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %27880 = "torch.prim.ListConstruct"(%27877, %18481, %27878, %27879) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %27881 = "torch.aten.view"(%27847, %27880) : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%27881, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %27882 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %27883 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %27884 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %27885 = "torch.prim.ListConstruct"(%27882, %18481, %27883, %27884) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %27886 = "torch.aten.view"(%27871, %27885) : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%27886, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %27887 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
    %27888 = "torch.constant.none"() : () -> !torch.none
    %27889 = "torch.constant.none"() : () -> !torch.none
    %27890 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %27891 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %27892 = "torch.aten.arange"(%27887, %27888, %27889, %27890, %27891) : (!torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[131072],si64>
    %27893 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %27894 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %27895 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %27896 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %27897 = "torch.constant.none"() : () -> !torch.none
    %27898 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %27899 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %27900 = "torch.aten.arange.start_step"(%27893, %27894, %27895, %27896, %27897, %27898, %27899) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[64],si64>
    %27901 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %27902 = "torch.prims.convert_element_type"(%27900, %27901) : (!torch.vtensor<[64],si64>, !torch.int) -> !torch.vtensor<[64],f32>
    %27903 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %27904 = "torch.aten.div.Scalar"(%27902, %27903) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %27905 = "torch.constant.float"() <{value = 5.000000e+05 : f64}> : () -> !torch.float
    %27906 = "torch.aten.pow.Scalar"(%27905, %27904) : (!torch.float, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %27907 = "torch.aten.reciprocal"(%27906) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %27908 = "torch.constant.float"() <{value = 1.000000e+00 : f64}> : () -> !torch.float
    %27909 = "torch.aten.mul.Scalar"(%27907, %27908) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
    %27910 = "torch.aten.reciprocal"(%27909) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %27911 = "torch.constant.float"() <{value = 6.2831853071795862 : f64}> : () -> !torch.float
    %27912 = "torch.aten.mul.Scalar"(%27910, %27911) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
    %27913 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
    %27914 = "torch.aten.gt.Scalar"(%27912, %27913) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %27915 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %27916 = "torch.aten.div.Scalar"(%27909, %27915) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %27917 = "torch.aten.where.self"(%27914, %27916, %27909) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %27918 = "torch.aten.reciprocal"(%27912) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %27919 = "torch.constant.int"() <{value = 8192 : i64}> : () -> !torch.int
    %27920 = "torch.aten.mul.Scalar"(%27918, %27919) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %27921 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %27922 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %27923 = "torch.aten.sub.Scalar"(%27920, %27921, %27922) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
    %27924 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %27925 = "torch.aten.div.Scalar"(%27923, %27924) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %27926 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %27927 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %27928 = "torch.aten.rsub.Scalar"(%27925, %27926, %27927) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
    %27929 = "torch.aten.mul.Tensor"(%27928, %27917) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %27930 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %27931 = "torch.aten.div.Scalar"(%27929, %27930) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %27932 = "torch.aten.mul.Tensor"(%27925, %27917) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %27933 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %27934 = "torch.aten.add.Tensor"(%27931, %27932, %27933) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %27935 = "torch.constant.float"() <{value = 2.048000e+03 : f64}> : () -> !torch.float
    %27936 = "torch.aten.lt.Scalar"(%27912, %27935) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %27937 = "torch.aten.bitwise_not"(%27936) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %27938 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
    %27939 = "torch.aten.gt.Scalar"(%27912, %27938) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %27940 = "torch.aten.bitwise_not"(%27939) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %27941 = "torch.aten.mul.Tensor"(%27937, %27940) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %27942 = "torch.aten.where.self"(%27941, %27934, %27917) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %27943 = "torch.prim.ListConstruct"(%27942, %27942) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.list<vtensor>
    %27944 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %27945 = "torch.aten.cat"(%27943, %27944) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[128],f32>
    %27946 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %27947 = "torch.prims.convert_element_type"(%27892, %27946) : (!torch.vtensor<[131072],si64>, !torch.int) -> !torch.vtensor<[131072],f32>
    %27948 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %27949 = "torch.prims.convert_element_type"(%27945, %27948) : (!torch.vtensor<[128],f32>, !torch.int) -> !torch.vtensor<[128],f32>
    %27950 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
    %27951 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %27952 = "torch.prim.ListConstruct"(%27950, %27951) : (!torch.int, !torch.int) -> !torch.list<int>
    %27953 = "torch.aten.view"(%27947, %27952) : (!torch.vtensor<[131072],f32>, !torch.list<int>) -> !torch.vtensor<[131072,1],f32>
    %27954 = "torch.aten.mul.Tensor"(%27953, %27949) : (!torch.vtensor<[131072,1],f32>, !torch.vtensor<[128],f32>) -> !torch.vtensor<[131072,128],f32>
    %27955 = "torch.aten.cos"(%27954) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
    %27956 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %27957 = "torch.prims.convert_element_type"(%27955, %27956) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
    %27958 = "torch.aten.sin"(%27954) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
    %27959 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %27960 = "torch.prims.convert_element_type"(%27958, %27959) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
    %27961 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %27962 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %27963 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %27964 = "torch.aten.slice.Tensor"(%27957, %27961, %27962, %18481, %27963) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%27964, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %27965 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %27966 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %27967 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %27968 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %27969 = "torch.aten.slice.Tensor"(%27964, %27965, %27966, %27967, %27968) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%27969, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %27970 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %27971 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %27972 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %27973 = "torch.aten.slice.Tensor"(%27960, %27970, %27971, %18481, %27972) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%27973, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %27974 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %27975 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %27976 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %27977 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %27978 = "torch.aten.slice.Tensor"(%27973, %27974, %27975, %27976, %27977) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%27978, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %27979 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %27980 = "torch.aten.unsqueeze"(%27969, %27979) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%27980, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %27981 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %27982 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %27983 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %27984 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %27985 = "torch.aten.slice.Tensor"(%27980, %27981, %27982, %27983, %27984) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%27985, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %27986 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %27987 = "torch.aten.unsqueeze"(%27985, %27986) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%27987, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %27988 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %27989 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %27990 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %27991 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %27992 = "torch.aten.slice.Tensor"(%27987, %27988, %27989, %27990, %27991) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%27992, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %27993 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %27994 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %27995 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %27996 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %27997 = "torch.prim.ListConstruct"(%27993, %27994, %27995, %27996) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %27998 = "torch.aten.repeat"(%27992, %27997) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%27998, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
    %27999 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %28000 = "torch.aten.unsqueeze"(%27978, %27999) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%28000, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %28001 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %28002 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %28003 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %28004 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %28005 = "torch.aten.slice.Tensor"(%28000, %28001, %28002, %28003, %28004) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%28005, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %28006 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %28007 = "torch.aten.unsqueeze"(%28005, %28006) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%28007, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %28008 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %28009 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %28010 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %28011 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %28012 = "torch.aten.slice.Tensor"(%28007, %28008, %28009, %28010, %28011) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%28012, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %28013 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %28014 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %28015 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %28016 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %28017 = "torch.prim.ListConstruct"(%28013, %28014, %28015, %28016) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %28018 = "torch.aten.repeat"(%28012, %28017) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%28018, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
    %28019 = "torch.aten.mul.Tensor"(%27876, %27998) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%28019, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %28020 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %28021 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %28022 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %28023 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %28024 = "torch.aten.slice.Tensor"(%27876, %28020, %28021, %28022, %28023) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%28024, %18474) <{shape_expressions = #map18}> : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.int) -> ()
    %28025 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %28026 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %28027 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %28028 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %28029 = "torch.aten.slice.Tensor"(%27876, %28025, %28026, %28027, %28028) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%28029, %18474) <{shape_expressions = #map18}> : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.int) -> ()
    %28030 = "torch.aten.neg"(%28029) : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>) -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%28030, %18474) <{shape_expressions = #map18}> : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.int) -> ()
    %28031 = "torch.prim.ListConstruct"(%28030, %28024) : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>) -> !torch.list<vtensor>
    %28032 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %28033 = "torch.aten.cat"(%28031, %28032) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%28033, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %28034 = "torch.aten.mul.Tensor"(%28033, %28018) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%28034, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %28035 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %28036 = "torch.aten.add.Tensor"(%28019, %28034, %28035) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%28036, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %28037 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
    %28038 = "torch.constant.none"() : () -> !torch.none
    %28039 = "torch.constant.none"() : () -> !torch.none
    %28040 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %28041 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %28042 = "torch.aten.arange"(%28037, %28038, %28039, %28040, %28041) : (!torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[131072],si64>
    %28043 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %28044 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %28045 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %28046 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %28047 = "torch.constant.none"() : () -> !torch.none
    %28048 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %28049 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %28050 = "torch.aten.arange.start_step"(%28043, %28044, %28045, %28046, %28047, %28048, %28049) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[64],si64>
    %28051 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %28052 = "torch.prims.convert_element_type"(%28050, %28051) : (!torch.vtensor<[64],si64>, !torch.int) -> !torch.vtensor<[64],f32>
    %28053 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %28054 = "torch.aten.div.Scalar"(%28052, %28053) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %28055 = "torch.constant.float"() <{value = 5.000000e+05 : f64}> : () -> !torch.float
    %28056 = "torch.aten.pow.Scalar"(%28055, %28054) : (!torch.float, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %28057 = "torch.aten.reciprocal"(%28056) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %28058 = "torch.constant.float"() <{value = 1.000000e+00 : f64}> : () -> !torch.float
    %28059 = "torch.aten.mul.Scalar"(%28057, %28058) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
    %28060 = "torch.aten.reciprocal"(%28059) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %28061 = "torch.constant.float"() <{value = 6.2831853071795862 : f64}> : () -> !torch.float
    %28062 = "torch.aten.mul.Scalar"(%28060, %28061) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
    %28063 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
    %28064 = "torch.aten.gt.Scalar"(%28062, %28063) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %28065 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %28066 = "torch.aten.div.Scalar"(%28059, %28065) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %28067 = "torch.aten.where.self"(%28064, %28066, %28059) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %28068 = "torch.aten.reciprocal"(%28062) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %28069 = "torch.constant.int"() <{value = 8192 : i64}> : () -> !torch.int
    %28070 = "torch.aten.mul.Scalar"(%28068, %28069) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %28071 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %28072 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %28073 = "torch.aten.sub.Scalar"(%28070, %28071, %28072) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
    %28074 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %28075 = "torch.aten.div.Scalar"(%28073, %28074) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %28076 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %28077 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %28078 = "torch.aten.rsub.Scalar"(%28075, %28076, %28077) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
    %28079 = "torch.aten.mul.Tensor"(%28078, %28067) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %28080 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %28081 = "torch.aten.div.Scalar"(%28079, %28080) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %28082 = "torch.aten.mul.Tensor"(%28075, %28067) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %28083 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %28084 = "torch.aten.add.Tensor"(%28081, %28082, %28083) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %28085 = "torch.constant.float"() <{value = 2.048000e+03 : f64}> : () -> !torch.float
    %28086 = "torch.aten.lt.Scalar"(%28062, %28085) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %28087 = "torch.aten.bitwise_not"(%28086) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %28088 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
    %28089 = "torch.aten.gt.Scalar"(%28062, %28088) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %28090 = "torch.aten.bitwise_not"(%28089) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %28091 = "torch.aten.mul.Tensor"(%28087, %28090) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %28092 = "torch.aten.where.self"(%28091, %28084, %28067) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %28093 = "torch.prim.ListConstruct"(%28092, %28092) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.list<vtensor>
    %28094 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %28095 = "torch.aten.cat"(%28093, %28094) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[128],f32>
    %28096 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %28097 = "torch.prims.convert_element_type"(%28042, %28096) : (!torch.vtensor<[131072],si64>, !torch.int) -> !torch.vtensor<[131072],f32>
    %28098 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %28099 = "torch.prims.convert_element_type"(%28095, %28098) : (!torch.vtensor<[128],f32>, !torch.int) -> !torch.vtensor<[128],f32>
    %28100 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
    %28101 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %28102 = "torch.prim.ListConstruct"(%28100, %28101) : (!torch.int, !torch.int) -> !torch.list<int>
    %28103 = "torch.aten.view"(%28097, %28102) : (!torch.vtensor<[131072],f32>, !torch.list<int>) -> !torch.vtensor<[131072,1],f32>
    %28104 = "torch.aten.mul.Tensor"(%28103, %28099) : (!torch.vtensor<[131072,1],f32>, !torch.vtensor<[128],f32>) -> !torch.vtensor<[131072,128],f32>
    %28105 = "torch.aten.cos"(%28104) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
    %28106 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %28107 = "torch.prims.convert_element_type"(%28105, %28106) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
    %28108 = "torch.aten.sin"(%28104) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
    %28109 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %28110 = "torch.prims.convert_element_type"(%28108, %28109) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
    %28111 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %28112 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %28113 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %28114 = "torch.aten.slice.Tensor"(%28107, %28111, %28112, %18481, %28113) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%28114, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %28115 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %28116 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %28117 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %28118 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %28119 = "torch.aten.slice.Tensor"(%28114, %28115, %28116, %28117, %28118) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%28119, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %28120 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %28121 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %28122 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %28123 = "torch.aten.slice.Tensor"(%28110, %28120, %28121, %18481, %28122) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%28123, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %28124 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %28125 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %28126 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %28127 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %28128 = "torch.aten.slice.Tensor"(%28123, %28124, %28125, %28126, %28127) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%28128, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %28129 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %28130 = "torch.aten.unsqueeze"(%28119, %28129) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%28130, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %28131 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %28132 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %28133 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %28134 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %28135 = "torch.aten.slice.Tensor"(%28130, %28131, %28132, %28133, %28134) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%28135, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %28136 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %28137 = "torch.aten.unsqueeze"(%28135, %28136) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%28137, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %28138 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %28139 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %28140 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %28141 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %28142 = "torch.aten.slice.Tensor"(%28137, %28138, %28139, %28140, %28141) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%28142, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %28143 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %28144 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %28145 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %28146 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %28147 = "torch.prim.ListConstruct"(%28143, %28144, %28145, %28146) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %28148 = "torch.aten.repeat"(%28142, %28147) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%28148, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
    %28149 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %28150 = "torch.aten.unsqueeze"(%28128, %28149) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%28150, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %28151 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %28152 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %28153 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %28154 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %28155 = "torch.aten.slice.Tensor"(%28150, %28151, %28152, %28153, %28154) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%28155, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %28156 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %28157 = "torch.aten.unsqueeze"(%28155, %28156) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%28157, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %28158 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %28159 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %28160 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %28161 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %28162 = "torch.aten.slice.Tensor"(%28157, %28158, %28159, %28160, %28161) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%28162, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %28163 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %28164 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %28165 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %28166 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %28167 = "torch.prim.ListConstruct"(%28163, %28164, %28165, %28166) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %28168 = "torch.aten.repeat"(%28162, %28167) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%28168, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
    %28169 = "torch.aten.mul.Tensor"(%27881, %28148) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%28169, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %28170 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %28171 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %28172 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %28173 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %28174 = "torch.aten.slice.Tensor"(%27881, %28170, %28171, %28172, %28173) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%28174, %18474) <{shape_expressions = #map19}> : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.int) -> ()
    %28175 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %28176 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %28177 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %28178 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %28179 = "torch.aten.slice.Tensor"(%27881, %28175, %28176, %28177, %28178) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%28179, %18474) <{shape_expressions = #map19}> : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.int) -> ()
    %28180 = "torch.aten.neg"(%28179) : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>) -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%28180, %18474) <{shape_expressions = #map19}> : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.int) -> ()
    %28181 = "torch.prim.ListConstruct"(%28180, %28174) : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>) -> !torch.list<vtensor>
    %28182 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %28183 = "torch.aten.cat"(%28181, %28182) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%28183, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %28184 = "torch.aten.mul.Tensor"(%28183, %28168) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%28184, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %28185 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %28186 = "torch.aten.add.Tensor"(%28169, %28184, %28185) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%28186, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %28187 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %28188 = "torch.aten.mul.Scalar"(%arg69, %28187) : (!torch.vtensor<[4,?],si64>, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%28188, %18474) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %28189 = "torch.constant.int"() <{value = 28 : i64}> : () -> !torch.int
    %28190 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %28191 = "torch.aten.add.Scalar"(%28188, %28189, %28190) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%28191, %18474) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %28192 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %28193 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %28194 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %28195 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %28196 = "torch.prim.ListConstruct"(%28192, %18477, %28193, %28194, %28195) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %28197 = "torch.aten.view"(%28186, %28196) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%28197, %18474) <{shape_expressions = #map22}> : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %28198 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %28199 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %28200 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %28201 = "torch.prim.ListConstruct"(%19011, %28198, %28199, %28200) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %28202 = "torch.aten.view"(%28197, %28201) : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%28202, %18474) <{shape_expressions = #map23}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %28203 = "torch.prim.ListConstruct"(%19011) : (!torch.int) -> !torch.list<int>
    %28204 = "torch.aten.view"(%28191, %28203) : (!torch.vtensor<[4,?],si64>, !torch.list<int>) -> !torch.vtensor<[?],si64>
    "torch.bind_symbolic_shape"(%28204, %18474) <{shape_expressions = #map24}> : (!torch.vtensor<[?],si64>, !torch.int) -> ()
    %28205 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %28206 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %28207 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %28208 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %28209 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %28210 = "torch.prim.ListConstruct"(%18479, %28205, %28206, %28207, %28208, %28209) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %28211 = "torch.aten.view"(%27613, %28210) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%28211, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %28212 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %28213 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %28214 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %28215 = "torch.prim.ListConstruct"(%18993, %28212, %28213, %28214) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %28216 = "torch.aten.view"(%28211, %28215) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%28216, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %28217 = "torch.prim.ListConstruct"(%28204) : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>>
    %28218 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %28219 = "torch.aten.index_put"(%28216, %28217, %28202, %28218) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.bool) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%28219, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %28220 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %28221 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %28222 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %28223 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %28224 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %28225 = "torch.prim.ListConstruct"(%18479, %28220, %28221, %28222, %28223, %28224) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %28226 = "torch.aten.view"(%28219, %28225) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%28226, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %28227 = "torch.constant.int"() <{value = 2097152 : i64}> : () -> !torch.int
    %28228 = "torch.prim.ListConstruct"(%18479, %28227) : (!torch.int, !torch.int) -> !torch.list<int>
    %28229 = "torch.aten.view"(%28226, %28228) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,2097152],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%28229, %18475) <{shape_expressions = #map2}> : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> ()
    %28230 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %28231 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %28232 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %28233 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %28234 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %28235 = "torch.prim.ListConstruct"(%18479, %28230, %28231, %28232, %28233, %28234) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %28236 = "torch.aten.view"(%28229, %28235) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%28236, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %28237 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %28238 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %28239 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %28240 = "torch.prim.ListConstruct"(%18993, %28237, %28238, %28239) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %28241 = "torch.aten.view"(%28236, %28240) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%28241, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %28242 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %28243 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %28244 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %28245 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %28246 = "torch.prim.ListConstruct"(%28242, %18477, %28243, %28244, %28245) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %28247 = "torch.aten.view"(%27886, %28246) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%28247, %18474) <{shape_expressions = #map22}> : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %28248 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %28249 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %28250 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %28251 = "torch.prim.ListConstruct"(%19011, %28248, %28249, %28250) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %28252 = "torch.aten.view"(%28247, %28251) : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%28252, %18474) <{shape_expressions = #map23}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %28253 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %28254 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %28255 = "torch.aten.add.Scalar"(%28191, %28253, %28254) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%28255, %18474) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %28256 = "torch.prim.ListConstruct"(%19011) : (!torch.int) -> !torch.list<int>
    %28257 = "torch.aten.view"(%28255, %28256) : (!torch.vtensor<[4,?],si64>, !torch.list<int>) -> !torch.vtensor<[?],si64>
    "torch.bind_symbolic_shape"(%28257, %18474) <{shape_expressions = #map24}> : (!torch.vtensor<[?],si64>, !torch.int) -> ()
    %28258 = "torch.prim.ListConstruct"(%28257) : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>>
    %28259 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %28260 = "torch.aten.index_put"(%28241, %28258, %28252, %28259) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.bool) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%28260, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %28261 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %28262 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %28263 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %28264 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %28265 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %28266 = "torch.prim.ListConstruct"(%18479, %28261, %28262, %28263, %28264, %28265) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %28267 = "torch.aten.view"(%28260, %28266) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%28267, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %28268 = "torch.constant.int"() <{value = 2097152 : i64}> : () -> !torch.int
    %28269 = "torch.prim.ListConstruct"(%18479, %28268) : (!torch.int, !torch.int) -> !torch.list<int>
    %28270 = "torch.aten.view"(%28267, %28269) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,2097152],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%28270, %18475) <{shape_expressions = #map2}> : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> ()
    %28271 = "torch.constant.int"() <{value = -2 : i64}> : () -> !torch.int
    %28272 = "torch.aten.unsqueeze"(%28186, %28271) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%28272, %18474) <{shape_expressions = #map25}> : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.int) -> ()
    %28273 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %28274 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %28275 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %28276 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %28277 = "torch.prim.ListConstruct"(%28273, %18481, %28274, %28275, %28276) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %28278 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %28279 = "torch.aten.expand"(%28272, %28277, %28278) : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%28279, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %28280 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %28281 = "torch.aten.clone"(%28279, %28280) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%28281, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %28282 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %28283 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %28284 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %28285 = "torch.prim.ListConstruct"(%28282, %18481, %28283, %28284) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %28286 = "torch.aten._unsafe_view"(%28281, %28285) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%28286, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %28287 = "torch.constant.int"() <{value = -2 : i64}> : () -> !torch.int
    %28288 = "torch.aten.unsqueeze"(%27886, %28287) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%28288, %18474) <{shape_expressions = #map25}> : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.int) -> ()
    %28289 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %28290 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %28291 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %28292 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %28293 = "torch.prim.ListConstruct"(%28289, %18481, %28290, %28291, %28292) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %28294 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %28295 = "torch.aten.expand"(%28288, %28293, %28294) : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%28295, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %28296 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %28297 = "torch.aten.clone"(%28295, %28296) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%28297, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %28298 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %28299 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %28300 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %28301 = "torch.prim.ListConstruct"(%28298, %18481, %28299, %28300) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %28302 = "torch.aten._unsafe_view"(%28297, %28301) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%28302, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %28303 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %28304 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %28305 = "torch.aten.transpose.int"(%28036, %28303, %28304) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%28305, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %28306 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %28307 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %28308 = "torch.aten.transpose.int"(%28286, %28306, %28307) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%28308, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %28309 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %28310 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %28311 = "torch.aten.transpose.int"(%28302, %28309, %28310) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%28311, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %28312 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %28313 = "torch.aten.squeeze.dim"(%18570, %28312) : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,1,?,?],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%28313, %18474) <{shape_expressions = #map8}> : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> ()
    %28314 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %28315 = "torch.aten.squeeze.dim"(%28313, %28314) : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,1,?,?],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%28315, %18474) <{shape_expressions = #map8}> : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> ()
    %28316 = "torch_c.to_builtin_tensor"(%28305) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %28317 = "torch_c.to_builtin_tensor"(%28308) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %28318 = "torch_c.to_builtin_tensor"(%28311) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %28319 = "torch_c.to_builtin_tensor"(%28315) : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>) -> tensor<4x1x?x?xf8E4M3FNUZ>
    %28320 = "tensor.cast"(%28319) : (tensor<4x1x?x?xf8E4M3FNUZ>) -> tensor<?x?x?x?xf8E4M3FNUZ>
    %28321 = "torch_c.to_builtin_tensor"(%17769) : (!torch.vtensor<[],f32>) -> tensor<f32>
    %28322 = "util.call"(%28316, %28317, %28318, %28321, %28320) <{callee = @sharktank_masked_flash_attention_4_32_128_128_f8E4M3FNUZ_f32_f32}> : (tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<f32>, tensor<?x?x?x?xf8E4M3FNUZ>) -> tensor<4x32x?x128xf32>
    %28323 = "torch_c.from_builtin_tensor"(%28322) : (tensor<4x32x?x128xf32>) -> !torch.vtensor<[4,32,?,128],f32>
    "torch.bind_symbolic_shape"(%28323, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f32>, !torch.int) -> ()
    %28324 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %28325 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %28326 = "torch.aten.transpose.int"(%28323, %28324, %28325) : (!torch.vtensor<[4,32,?,128],f32>, !torch.int, !torch.int) -> !torch.vtensor<[4,?,32,128],f32>
    "torch.bind_symbolic_shape"(%28326, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f32>, !torch.int) -> ()
    %28327 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %28328 = "torch.aten.clone"(%28326, %28327) : (!torch.vtensor<[4,?,32,128],f32>, !torch.int) -> !torch.vtensor<[4,?,32,128],f32>
    "torch.bind_symbolic_shape"(%28328, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f32>, !torch.int) -> ()
    %28329 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %28330 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %28331 = "torch.prim.ListConstruct"(%28329, %18481, %28330) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %28332 = "torch.aten._unsafe_view"(%28328, %28331) : (!torch.vtensor<[4,?,32,128],f32>, !torch.list<int>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%28332, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %28333 = "torch.aten.div.Tensor"(%28332, %17771) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%28333, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %28334 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %28335 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %28336 = "torch.aten.clamp"(%28333, %28334, %28335) : (!torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%28336, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %28337 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %28338 = "torch.prims.convert_element_type"(%28336, %28337) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%28338, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %28339 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %28340 = "torch.aten.unsqueeze"(%17773, %28339) : (!torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>
    %28341 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %28342 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %28343 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %28344 = "torch.prim.ListConstruct"(%28341, %28342, %28343) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %28345 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %28346 = "torch.aten.expand"(%28340, %28344, %28345) : (!torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ>
    %28347 = "torch_c.to_builtin_tensor"(%28338) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %28348 = "torch_c.to_builtin_tensor"(%28346) : (!torch.vtensor<[4,4096,4096],f8E4M3FNUZ>) -> tensor<4x4096x4096xf8E4M3FNUZ>
    %28349 = "util.call"(%28347, %28348) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x?x4096xf32>
    %28350 = "torch_c.from_builtin_tensor"(%28349) : (tensor<4x?x4096xf32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%28350, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %28351 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %28352 = "torch.prims.convert_element_type"(%28350, %28351) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%28352, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %28353 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %28354 = "torch.aten.add.Tensor"(%27780, %28352, %28353) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%28354, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %28355 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %28356 = "torch.prims.convert_element_type"(%28354, %28355) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%28356, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %28357 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %28358 = "torch.aten.pow.Tensor_Scalar"(%28356, %28357) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%28358, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %28359 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %28360 = "torch.prim.ListConstruct"(%28359) : (!torch.int) -> !torch.list<int>
    %28361 = "torch.constant.bool"() <{value = true}> : () -> !torch.bool
    %28362 = "torch.constant.none"() : () -> !torch.none
    %28363 = "torch.aten.mean.dim"(%28358, %28360, %28361, %28362) : (!torch.vtensor<[4,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%28363, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %28364 = "torch.constant.float"() <{value = 1.000000e-05 : f64}> : () -> !torch.float
    %28365 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %28366 = "torch.aten.add.Scalar"(%28363, %28364, %28365) : (!torch.vtensor<[4,?,1],f32>, !torch.float, !torch.int) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%28366, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %28367 = "torch.aten.rsqrt"(%28366) : (!torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%28367, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %28368 = "torch.aten.mul.Tensor"(%28356, %28367) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%28368, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %28369 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %28370 = "torch.prims.convert_element_type"(%28368, %28369) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%28370, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %28371 = "torch.aten.mul.Tensor"(%17775, %28370) : (!torch.vtensor<[4096],bf16>, !torch.vtensor<[4,?,4096],bf16>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%28371, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %28372 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %28373 = "torch.prims.convert_element_type"(%28371, %28372) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%28373, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %28374 = "torch.aten.div.Tensor"(%28373, %17777) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%28374, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %28375 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %28376 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %28377 = "torch.aten.clamp"(%28374, %28375, %28376) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%28377, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %28378 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %28379 = "torch.prims.convert_element_type"(%28377, %28378) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%28379, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %28380 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %28381 = "torch.aten.unsqueeze"(%17779, %28380) : (!torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>
    %28382 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %28383 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %28384 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %28385 = "torch.prim.ListConstruct"(%28382, %28383, %28384) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %28386 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %28387 = "torch.aten.expand"(%28381, %28385, %28386) : (!torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ>
    %28388 = "torch_c.to_builtin_tensor"(%28379) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %28389 = "torch_c.to_builtin_tensor"(%28387) : (!torch.vtensor<[4,14336,4096],f8E4M3FNUZ>) -> tensor<4x14336x4096xf8E4M3FNUZ>
    %28390 = "util.call"(%28388, %28389) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x?x14336xf32>
    %28391 = "torch_c.from_builtin_tensor"(%28390) : (tensor<4x?x14336xf32>) -> !torch.vtensor<[4,?,14336],f32>
    "torch.bind_symbolic_shape"(%28391, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> ()
    %28392 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %28393 = "torch.prims.convert_element_type"(%28391, %28392) : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%28393, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %28394 = "torch.aten.silu"(%28393) : (!torch.vtensor<[4,?,14336],bf16>) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%28394, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %28395 = "torch.aten.div.Tensor"(%28373, %17781) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%28395, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %28396 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %28397 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %28398 = "torch.aten.clamp"(%28395, %28396, %28397) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%28398, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %28399 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %28400 = "torch.prims.convert_element_type"(%28398, %28399) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%28400, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %28401 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %28402 = "torch.aten.unsqueeze"(%17783, %28401) : (!torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>
    %28403 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %28404 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %28405 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %28406 = "torch.prim.ListConstruct"(%28403, %28404, %28405) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %28407 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %28408 = "torch.aten.expand"(%28402, %28406, %28407) : (!torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ>
    %28409 = "torch_c.to_builtin_tensor"(%28400) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %28410 = "torch_c.to_builtin_tensor"(%28408) : (!torch.vtensor<[4,14336,4096],f8E4M3FNUZ>) -> tensor<4x14336x4096xf8E4M3FNUZ>
    %28411 = "util.call"(%28409, %28410) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x?x14336xf32>
    %28412 = "torch_c.from_builtin_tensor"(%28411) : (tensor<4x?x14336xf32>) -> !torch.vtensor<[4,?,14336],f32>
    "torch.bind_symbolic_shape"(%28412, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> ()
    %28413 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %28414 = "torch.prims.convert_element_type"(%28412, %28413) : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%28414, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %28415 = "torch.aten.mul.Tensor"(%28394, %28414) : (!torch.vtensor<[4,?,14336],bf16>, !torch.vtensor<[4,?,14336],bf16>) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%28415, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %28416 = "torch.aten.div.Tensor"(%28415, %17785) : (!torch.vtensor<[4,?,14336],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%28416, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %28417 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %28418 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %28419 = "torch.aten.clamp"(%28416, %28417, %28418) : (!torch.vtensor<[4,?,14336],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%28419, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %28420 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %28421 = "torch.prims.convert_element_type"(%28419, %28420) : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> !torch.vtensor<[4,?,14336],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%28421, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],f8E4M3FNUZ>, !torch.int) -> ()
    %28422 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %28423 = "torch.aten.unsqueeze"(%17787, %28422) : (!torch.vtensor<[4096,14336],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,14336],f8E4M3FNUZ>
    %28424 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %28425 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %28426 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %28427 = "torch.prim.ListConstruct"(%28424, %28425, %28426) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %28428 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %28429 = "torch.aten.expand"(%28423, %28427, %28428) : (!torch.vtensor<[1,4096,14336],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,14336],f8E4M3FNUZ>
    %28430 = "torch_c.to_builtin_tensor"(%28421) : (!torch.vtensor<[4,?,14336],f8E4M3FNUZ>) -> tensor<4x?x14336xf8E4M3FNUZ>
    %28431 = "torch_c.to_builtin_tensor"(%28429) : (!torch.vtensor<[4,4096,14336],f8E4M3FNUZ>) -> tensor<4x4096x14336xf8E4M3FNUZ>
    %28432 = "util.call"(%28430, %28431) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx14336xf8E4M3FNUZ_R4x4096x14336xf8E4M3FNUZ}> : (tensor<4x?x14336xf8E4M3FNUZ>, tensor<4x4096x14336xf8E4M3FNUZ>) -> tensor<4x?x4096xf32>
    %28433 = "torch_c.from_builtin_tensor"(%28432) : (tensor<4x?x4096xf32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%28433, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %28434 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %28435 = "torch.prims.convert_element_type"(%28433, %28434) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%28435, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %28436 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %28437 = "torch.aten.add.Tensor"(%28354, %28435, %28436) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%28437, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %28438 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %28439 = "torch.prims.convert_element_type"(%28437, %28438) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%28439, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %28440 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %28441 = "torch.aten.pow.Tensor_Scalar"(%28439, %28440) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%28441, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %28442 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %28443 = "torch.prim.ListConstruct"(%28442) : (!torch.int) -> !torch.list<int>
    %28444 = "torch.constant.bool"() <{value = true}> : () -> !torch.bool
    %28445 = "torch.constant.none"() : () -> !torch.none
    %28446 = "torch.aten.mean.dim"(%28441, %28443, %28444, %28445) : (!torch.vtensor<[4,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%28446, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %28447 = "torch.constant.float"() <{value = 1.000000e-05 : f64}> : () -> !torch.float
    %28448 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %28449 = "torch.aten.add.Scalar"(%28446, %28447, %28448) : (!torch.vtensor<[4,?,1],f32>, !torch.float, !torch.int) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%28449, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %28450 = "torch.aten.rsqrt"(%28449) : (!torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%28450, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %28451 = "torch.aten.mul.Tensor"(%28439, %28450) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%28451, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %28452 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %28453 = "torch.prims.convert_element_type"(%28451, %28452) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%28453, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %28454 = "torch.aten.mul.Tensor"(%17789, %28453) : (!torch.vtensor<[4096],bf16>, !torch.vtensor<[4,?,4096],bf16>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%28454, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %28455 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %28456 = "torch.prims.convert_element_type"(%28454, %28455) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%28456, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %28457 = "torch.aten.div.Tensor"(%28456, %17791) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%28457, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %28458 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %28459 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %28460 = "torch.aten.clamp"(%28457, %28458, %28459) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%28460, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %28461 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %28462 = "torch.prims.convert_element_type"(%28460, %28461) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%28462, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %28463 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %28464 = "torch.aten.unsqueeze"(%17793, %28463) : (!torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>
    %28465 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %28466 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %28467 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %28468 = "torch.prim.ListConstruct"(%28465, %28466, %28467) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %28469 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %28470 = "torch.aten.expand"(%28464, %28468, %28469) : (!torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ>
    %28471 = "torch_c.to_builtin_tensor"(%28462) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %28472 = "torch_c.to_builtin_tensor"(%28470) : (!torch.vtensor<[4,4096,4096],f8E4M3FNUZ>) -> tensor<4x4096x4096xf8E4M3FNUZ>
    %28473 = "util.call"(%28471, %28472) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x?x4096xf32>
    %28474 = "torch_c.from_builtin_tensor"(%28473) : (tensor<4x?x4096xf32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%28474, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %28475 = "torch.aten.div.Tensor"(%28474, %17795) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%28475, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %28476 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %28477 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %28478 = "torch.aten.clamp"(%28475, %28476, %28477) : (!torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%28478, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %28479 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %28480 = "torch.prims.convert_element_type"(%28478, %28479) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%28480, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %28481 = "torch.aten.div.Tensor"(%28456, %17797) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%28481, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %28482 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %28483 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %28484 = "torch.aten.clamp"(%28481, %28482, %28483) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%28484, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %28485 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %28486 = "torch.prims.convert_element_type"(%28484, %28485) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%28486, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %28487 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %28488 = "torch.aten.unsqueeze"(%17799, %28487) : (!torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>
    %28489 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %28490 = "torch.constant.int"() <{value = 1024 : i64}> : () -> !torch.int
    %28491 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %28492 = "torch.prim.ListConstruct"(%28489, %28490, %28491) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %28493 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %28494 = "torch.aten.expand"(%28488, %28492, %28493) : (!torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ>
    %28495 = "torch_c.to_builtin_tensor"(%28486) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %28496 = "torch_c.to_builtin_tensor"(%28494) : (!torch.vtensor<[4,1024,4096],f8E4M3FNUZ>) -> tensor<4x1024x4096xf8E4M3FNUZ>
    %28497 = "util.call"(%28495, %28496) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x?x1024xf32>
    %28498 = "torch_c.from_builtin_tensor"(%28497) : (tensor<4x?x1024xf32>) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%28498, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %28499 = "torch.aten.div.Tensor"(%28498, %17801) : (!torch.vtensor<[4,?,1024],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%28499, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %28500 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %28501 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %28502 = "torch.aten.clamp"(%28499, %28500, %28501) : (!torch.vtensor<[4,?,1024],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%28502, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %28503 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %28504 = "torch.prims.convert_element_type"(%28502, %28503) : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> !torch.vtensor<[4,?,1024],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%28504, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.int) -> ()
    %28505 = "torch.aten.div.Tensor"(%28456, %17803) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%28505, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %28506 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %28507 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %28508 = "torch.aten.clamp"(%28505, %28506, %28507) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%28508, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %28509 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %28510 = "torch.prims.convert_element_type"(%28508, %28509) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%28510, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %28511 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %28512 = "torch.aten.unsqueeze"(%17805, %28511) : (!torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>
    %28513 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %28514 = "torch.constant.int"() <{value = 1024 : i64}> : () -> !torch.int
    %28515 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %28516 = "torch.prim.ListConstruct"(%28513, %28514, %28515) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %28517 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %28518 = "torch.aten.expand"(%28512, %28516, %28517) : (!torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ>
    %28519 = "torch_c.to_builtin_tensor"(%28510) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %28520 = "torch_c.to_builtin_tensor"(%28518) : (!torch.vtensor<[4,1024,4096],f8E4M3FNUZ>) -> tensor<4x1024x4096xf8E4M3FNUZ>
    %28521 = "util.call"(%28519, %28520) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x?x1024xf32>
    %28522 = "torch_c.from_builtin_tensor"(%28521) : (tensor<4x?x1024xf32>) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%28522, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %28523 = "torch.aten.div.Tensor"(%28522, %17807) : (!torch.vtensor<[4,?,1024],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%28523, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %28524 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %28525 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %28526 = "torch.aten.clamp"(%28523, %28524, %28525) : (!torch.vtensor<[4,?,1024],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%28526, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %28527 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %28528 = "torch.prims.convert_element_type"(%28526, %28527) : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> !torch.vtensor<[4,?,1024],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%28528, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.int) -> ()
    %28529 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %28530 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %28531 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %28532 = "torch.prim.ListConstruct"(%28529, %18481, %28530, %28531) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %28533 = "torch.aten.view"(%28480, %28532) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%28533, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %28534 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %28535 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %28536 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %28537 = "torch.prim.ListConstruct"(%28534, %18481, %28535, %28536) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %28538 = "torch.aten.view"(%28504, %28537) : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%28538, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %28539 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %28540 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %28541 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %28542 = "torch.prim.ListConstruct"(%28539, %18481, %28540, %28541) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %28543 = "torch.aten.view"(%28528, %28542) : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%28543, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %28544 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
    %28545 = "torch.constant.none"() : () -> !torch.none
    %28546 = "torch.constant.none"() : () -> !torch.none
    %28547 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %28548 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %28549 = "torch.aten.arange"(%28544, %28545, %28546, %28547, %28548) : (!torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[131072],si64>
    %28550 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %28551 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %28552 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %28553 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %28554 = "torch.constant.none"() : () -> !torch.none
    %28555 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %28556 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %28557 = "torch.aten.arange.start_step"(%28550, %28551, %28552, %28553, %28554, %28555, %28556) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[64],si64>
    %28558 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %28559 = "torch.prims.convert_element_type"(%28557, %28558) : (!torch.vtensor<[64],si64>, !torch.int) -> !torch.vtensor<[64],f32>
    %28560 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %28561 = "torch.aten.div.Scalar"(%28559, %28560) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %28562 = "torch.constant.float"() <{value = 5.000000e+05 : f64}> : () -> !torch.float
    %28563 = "torch.aten.pow.Scalar"(%28562, %28561) : (!torch.float, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %28564 = "torch.aten.reciprocal"(%28563) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %28565 = "torch.constant.float"() <{value = 1.000000e+00 : f64}> : () -> !torch.float
    %28566 = "torch.aten.mul.Scalar"(%28564, %28565) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
    %28567 = "torch.aten.reciprocal"(%28566) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %28568 = "torch.constant.float"() <{value = 6.2831853071795862 : f64}> : () -> !torch.float
    %28569 = "torch.aten.mul.Scalar"(%28567, %28568) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
    %28570 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
    %28571 = "torch.aten.gt.Scalar"(%28569, %28570) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %28572 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %28573 = "torch.aten.div.Scalar"(%28566, %28572) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %28574 = "torch.aten.where.self"(%28571, %28573, %28566) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %28575 = "torch.aten.reciprocal"(%28569) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %28576 = "torch.constant.int"() <{value = 8192 : i64}> : () -> !torch.int
    %28577 = "torch.aten.mul.Scalar"(%28575, %28576) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %28578 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %28579 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %28580 = "torch.aten.sub.Scalar"(%28577, %28578, %28579) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
    %28581 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %28582 = "torch.aten.div.Scalar"(%28580, %28581) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %28583 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %28584 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %28585 = "torch.aten.rsub.Scalar"(%28582, %28583, %28584) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
    %28586 = "torch.aten.mul.Tensor"(%28585, %28574) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %28587 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %28588 = "torch.aten.div.Scalar"(%28586, %28587) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %28589 = "torch.aten.mul.Tensor"(%28582, %28574) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %28590 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %28591 = "torch.aten.add.Tensor"(%28588, %28589, %28590) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %28592 = "torch.constant.float"() <{value = 2.048000e+03 : f64}> : () -> !torch.float
    %28593 = "torch.aten.lt.Scalar"(%28569, %28592) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %28594 = "torch.aten.bitwise_not"(%28593) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %28595 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
    %28596 = "torch.aten.gt.Scalar"(%28569, %28595) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %28597 = "torch.aten.bitwise_not"(%28596) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %28598 = "torch.aten.mul.Tensor"(%28594, %28597) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %28599 = "torch.aten.where.self"(%28598, %28591, %28574) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %28600 = "torch.prim.ListConstruct"(%28599, %28599) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.list<vtensor>
    %28601 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %28602 = "torch.aten.cat"(%28600, %28601) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[128],f32>
    %28603 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %28604 = "torch.prims.convert_element_type"(%28549, %28603) : (!torch.vtensor<[131072],si64>, !torch.int) -> !torch.vtensor<[131072],f32>
    %28605 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %28606 = "torch.prims.convert_element_type"(%28602, %28605) : (!torch.vtensor<[128],f32>, !torch.int) -> !torch.vtensor<[128],f32>
    %28607 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
    %28608 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %28609 = "torch.prim.ListConstruct"(%28607, %28608) : (!torch.int, !torch.int) -> !torch.list<int>
    %28610 = "torch.aten.view"(%28604, %28609) : (!torch.vtensor<[131072],f32>, !torch.list<int>) -> !torch.vtensor<[131072,1],f32>
    %28611 = "torch.aten.mul.Tensor"(%28610, %28606) : (!torch.vtensor<[131072,1],f32>, !torch.vtensor<[128],f32>) -> !torch.vtensor<[131072,128],f32>
    %28612 = "torch.aten.cos"(%28611) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
    %28613 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %28614 = "torch.prims.convert_element_type"(%28612, %28613) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
    %28615 = "torch.aten.sin"(%28611) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
    %28616 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %28617 = "torch.prims.convert_element_type"(%28615, %28616) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
    %28618 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %28619 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %28620 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %28621 = "torch.aten.slice.Tensor"(%28614, %28618, %28619, %18481, %28620) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%28621, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %28622 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %28623 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %28624 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %28625 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %28626 = "torch.aten.slice.Tensor"(%28621, %28622, %28623, %28624, %28625) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%28626, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %28627 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %28628 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %28629 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %28630 = "torch.aten.slice.Tensor"(%28617, %28627, %28628, %18481, %28629) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%28630, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %28631 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %28632 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %28633 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %28634 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %28635 = "torch.aten.slice.Tensor"(%28630, %28631, %28632, %28633, %28634) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%28635, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %28636 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %28637 = "torch.aten.unsqueeze"(%28626, %28636) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%28637, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %28638 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %28639 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %28640 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %28641 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %28642 = "torch.aten.slice.Tensor"(%28637, %28638, %28639, %28640, %28641) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%28642, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %28643 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %28644 = "torch.aten.unsqueeze"(%28642, %28643) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%28644, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %28645 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %28646 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %28647 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %28648 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %28649 = "torch.aten.slice.Tensor"(%28644, %28645, %28646, %28647, %28648) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%28649, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %28650 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %28651 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %28652 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %28653 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %28654 = "torch.prim.ListConstruct"(%28650, %28651, %28652, %28653) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %28655 = "torch.aten.repeat"(%28649, %28654) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%28655, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
    %28656 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %28657 = "torch.aten.unsqueeze"(%28635, %28656) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%28657, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %28658 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %28659 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %28660 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %28661 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %28662 = "torch.aten.slice.Tensor"(%28657, %28658, %28659, %28660, %28661) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%28662, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %28663 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %28664 = "torch.aten.unsqueeze"(%28662, %28663) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%28664, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %28665 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %28666 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %28667 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %28668 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %28669 = "torch.aten.slice.Tensor"(%28664, %28665, %28666, %28667, %28668) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%28669, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %28670 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %28671 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %28672 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %28673 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %28674 = "torch.prim.ListConstruct"(%28670, %28671, %28672, %28673) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %28675 = "torch.aten.repeat"(%28669, %28674) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%28675, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
    %28676 = "torch.aten.mul.Tensor"(%28533, %28655) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%28676, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %28677 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %28678 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %28679 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %28680 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %28681 = "torch.aten.slice.Tensor"(%28533, %28677, %28678, %28679, %28680) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%28681, %18474) <{shape_expressions = #map18}> : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.int) -> ()
    %28682 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %28683 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %28684 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %28685 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %28686 = "torch.aten.slice.Tensor"(%28533, %28682, %28683, %28684, %28685) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%28686, %18474) <{shape_expressions = #map18}> : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.int) -> ()
    %28687 = "torch.aten.neg"(%28686) : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>) -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%28687, %18474) <{shape_expressions = #map18}> : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.int) -> ()
    %28688 = "torch.prim.ListConstruct"(%28687, %28681) : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>) -> !torch.list<vtensor>
    %28689 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %28690 = "torch.aten.cat"(%28688, %28689) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%28690, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %28691 = "torch.aten.mul.Tensor"(%28690, %28675) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%28691, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %28692 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %28693 = "torch.aten.add.Tensor"(%28676, %28691, %28692) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%28693, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %28694 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
    %28695 = "torch.constant.none"() : () -> !torch.none
    %28696 = "torch.constant.none"() : () -> !torch.none
    %28697 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %28698 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %28699 = "torch.aten.arange"(%28694, %28695, %28696, %28697, %28698) : (!torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[131072],si64>
    %28700 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %28701 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %28702 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %28703 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %28704 = "torch.constant.none"() : () -> !torch.none
    %28705 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %28706 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %28707 = "torch.aten.arange.start_step"(%28700, %28701, %28702, %28703, %28704, %28705, %28706) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[64],si64>
    %28708 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %28709 = "torch.prims.convert_element_type"(%28707, %28708) : (!torch.vtensor<[64],si64>, !torch.int) -> !torch.vtensor<[64],f32>
    %28710 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %28711 = "torch.aten.div.Scalar"(%28709, %28710) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %28712 = "torch.constant.float"() <{value = 5.000000e+05 : f64}> : () -> !torch.float
    %28713 = "torch.aten.pow.Scalar"(%28712, %28711) : (!torch.float, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %28714 = "torch.aten.reciprocal"(%28713) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %28715 = "torch.constant.float"() <{value = 1.000000e+00 : f64}> : () -> !torch.float
    %28716 = "torch.aten.mul.Scalar"(%28714, %28715) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
    %28717 = "torch.aten.reciprocal"(%28716) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %28718 = "torch.constant.float"() <{value = 6.2831853071795862 : f64}> : () -> !torch.float
    %28719 = "torch.aten.mul.Scalar"(%28717, %28718) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
    %28720 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
    %28721 = "torch.aten.gt.Scalar"(%28719, %28720) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %28722 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %28723 = "torch.aten.div.Scalar"(%28716, %28722) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %28724 = "torch.aten.where.self"(%28721, %28723, %28716) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %28725 = "torch.aten.reciprocal"(%28719) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %28726 = "torch.constant.int"() <{value = 8192 : i64}> : () -> !torch.int
    %28727 = "torch.aten.mul.Scalar"(%28725, %28726) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %28728 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %28729 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %28730 = "torch.aten.sub.Scalar"(%28727, %28728, %28729) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
    %28731 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %28732 = "torch.aten.div.Scalar"(%28730, %28731) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %28733 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %28734 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %28735 = "torch.aten.rsub.Scalar"(%28732, %28733, %28734) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
    %28736 = "torch.aten.mul.Tensor"(%28735, %28724) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %28737 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %28738 = "torch.aten.div.Scalar"(%28736, %28737) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %28739 = "torch.aten.mul.Tensor"(%28732, %28724) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %28740 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %28741 = "torch.aten.add.Tensor"(%28738, %28739, %28740) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %28742 = "torch.constant.float"() <{value = 2.048000e+03 : f64}> : () -> !torch.float
    %28743 = "torch.aten.lt.Scalar"(%28719, %28742) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %28744 = "torch.aten.bitwise_not"(%28743) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %28745 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
    %28746 = "torch.aten.gt.Scalar"(%28719, %28745) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %28747 = "torch.aten.bitwise_not"(%28746) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %28748 = "torch.aten.mul.Tensor"(%28744, %28747) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %28749 = "torch.aten.where.self"(%28748, %28741, %28724) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %28750 = "torch.prim.ListConstruct"(%28749, %28749) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.list<vtensor>
    %28751 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %28752 = "torch.aten.cat"(%28750, %28751) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[128],f32>
    %28753 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %28754 = "torch.prims.convert_element_type"(%28699, %28753) : (!torch.vtensor<[131072],si64>, !torch.int) -> !torch.vtensor<[131072],f32>
    %28755 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %28756 = "torch.prims.convert_element_type"(%28752, %28755) : (!torch.vtensor<[128],f32>, !torch.int) -> !torch.vtensor<[128],f32>
    %28757 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
    %28758 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %28759 = "torch.prim.ListConstruct"(%28757, %28758) : (!torch.int, !torch.int) -> !torch.list<int>
    %28760 = "torch.aten.view"(%28754, %28759) : (!torch.vtensor<[131072],f32>, !torch.list<int>) -> !torch.vtensor<[131072,1],f32>
    %28761 = "torch.aten.mul.Tensor"(%28760, %28756) : (!torch.vtensor<[131072,1],f32>, !torch.vtensor<[128],f32>) -> !torch.vtensor<[131072,128],f32>
    %28762 = "torch.aten.cos"(%28761) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
    %28763 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %28764 = "torch.prims.convert_element_type"(%28762, %28763) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
    %28765 = "torch.aten.sin"(%28761) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
    %28766 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %28767 = "torch.prims.convert_element_type"(%28765, %28766) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
    %28768 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %28769 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %28770 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %28771 = "torch.aten.slice.Tensor"(%28764, %28768, %28769, %18481, %28770) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%28771, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %28772 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %28773 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %28774 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %28775 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %28776 = "torch.aten.slice.Tensor"(%28771, %28772, %28773, %28774, %28775) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%28776, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %28777 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %28778 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %28779 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %28780 = "torch.aten.slice.Tensor"(%28767, %28777, %28778, %18481, %28779) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%28780, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %28781 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %28782 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %28783 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %28784 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %28785 = "torch.aten.slice.Tensor"(%28780, %28781, %28782, %28783, %28784) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%28785, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %28786 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %28787 = "torch.aten.unsqueeze"(%28776, %28786) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%28787, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %28788 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %28789 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %28790 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %28791 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %28792 = "torch.aten.slice.Tensor"(%28787, %28788, %28789, %28790, %28791) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%28792, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %28793 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %28794 = "torch.aten.unsqueeze"(%28792, %28793) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%28794, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %28795 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %28796 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %28797 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %28798 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %28799 = "torch.aten.slice.Tensor"(%28794, %28795, %28796, %28797, %28798) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%28799, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %28800 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %28801 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %28802 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %28803 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %28804 = "torch.prim.ListConstruct"(%28800, %28801, %28802, %28803) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %28805 = "torch.aten.repeat"(%28799, %28804) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%28805, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
    %28806 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %28807 = "torch.aten.unsqueeze"(%28785, %28806) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%28807, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %28808 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %28809 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %28810 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %28811 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %28812 = "torch.aten.slice.Tensor"(%28807, %28808, %28809, %28810, %28811) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%28812, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %28813 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %28814 = "torch.aten.unsqueeze"(%28812, %28813) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%28814, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %28815 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %28816 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %28817 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %28818 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %28819 = "torch.aten.slice.Tensor"(%28814, %28815, %28816, %28817, %28818) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%28819, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %28820 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %28821 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %28822 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %28823 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %28824 = "torch.prim.ListConstruct"(%28820, %28821, %28822, %28823) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %28825 = "torch.aten.repeat"(%28819, %28824) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%28825, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
    %28826 = "torch.aten.mul.Tensor"(%28538, %28805) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%28826, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %28827 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %28828 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %28829 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %28830 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %28831 = "torch.aten.slice.Tensor"(%28538, %28827, %28828, %28829, %28830) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%28831, %18474) <{shape_expressions = #map19}> : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.int) -> ()
    %28832 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %28833 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %28834 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %28835 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %28836 = "torch.aten.slice.Tensor"(%28538, %28832, %28833, %28834, %28835) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%28836, %18474) <{shape_expressions = #map19}> : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.int) -> ()
    %28837 = "torch.aten.neg"(%28836) : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>) -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%28837, %18474) <{shape_expressions = #map19}> : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.int) -> ()
    %28838 = "torch.prim.ListConstruct"(%28837, %28831) : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>) -> !torch.list<vtensor>
    %28839 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %28840 = "torch.aten.cat"(%28838, %28839) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%28840, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %28841 = "torch.aten.mul.Tensor"(%28840, %28825) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%28841, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %28842 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %28843 = "torch.aten.add.Tensor"(%28826, %28841, %28842) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%28843, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %28844 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %28845 = "torch.aten.mul.Scalar"(%arg69, %28844) : (!torch.vtensor<[4,?],si64>, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%28845, %18474) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %28846 = "torch.constant.int"() <{value = 30 : i64}> : () -> !torch.int
    %28847 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %28848 = "torch.aten.add.Scalar"(%28845, %28846, %28847) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%28848, %18474) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %28849 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %28850 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %28851 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %28852 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %28853 = "torch.prim.ListConstruct"(%28849, %18477, %28850, %28851, %28852) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %28854 = "torch.aten.view"(%28843, %28853) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%28854, %18474) <{shape_expressions = #map22}> : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %28855 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %28856 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %28857 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %28858 = "torch.prim.ListConstruct"(%19011, %28855, %28856, %28857) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %28859 = "torch.aten.view"(%28854, %28858) : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%28859, %18474) <{shape_expressions = #map23}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %28860 = "torch.prim.ListConstruct"(%19011) : (!torch.int) -> !torch.list<int>
    %28861 = "torch.aten.view"(%28848, %28860) : (!torch.vtensor<[4,?],si64>, !torch.list<int>) -> !torch.vtensor<[?],si64>
    "torch.bind_symbolic_shape"(%28861, %18474) <{shape_expressions = #map24}> : (!torch.vtensor<[?],si64>, !torch.int) -> ()
    %28862 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %28863 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %28864 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %28865 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %28866 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %28867 = "torch.prim.ListConstruct"(%18479, %28862, %28863, %28864, %28865, %28866) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %28868 = "torch.aten.view"(%28270, %28867) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%28868, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %28869 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %28870 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %28871 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %28872 = "torch.prim.ListConstruct"(%18993, %28869, %28870, %28871) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %28873 = "torch.aten.view"(%28868, %28872) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%28873, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %28874 = "torch.prim.ListConstruct"(%28861) : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>>
    %28875 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %28876 = "torch.aten.index_put"(%28873, %28874, %28859, %28875) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.bool) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%28876, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %28877 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %28878 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %28879 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %28880 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %28881 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %28882 = "torch.prim.ListConstruct"(%18479, %28877, %28878, %28879, %28880, %28881) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %28883 = "torch.aten.view"(%28876, %28882) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%28883, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %28884 = "torch.constant.int"() <{value = 2097152 : i64}> : () -> !torch.int
    %28885 = "torch.prim.ListConstruct"(%18479, %28884) : (!torch.int, !torch.int) -> !torch.list<int>
    %28886 = "torch.aten.view"(%28883, %28885) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,2097152],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%28886, %18475) <{shape_expressions = #map2}> : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> ()
    %28887 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %28888 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %28889 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %28890 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %28891 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %28892 = "torch.prim.ListConstruct"(%18479, %28887, %28888, %28889, %28890, %28891) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %28893 = "torch.aten.view"(%28886, %28892) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%28893, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %28894 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %28895 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %28896 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %28897 = "torch.prim.ListConstruct"(%18993, %28894, %28895, %28896) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %28898 = "torch.aten.view"(%28893, %28897) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%28898, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %28899 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %28900 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %28901 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %28902 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %28903 = "torch.prim.ListConstruct"(%28899, %18477, %28900, %28901, %28902) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %28904 = "torch.aten.view"(%28543, %28903) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%28904, %18474) <{shape_expressions = #map22}> : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %28905 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %28906 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %28907 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %28908 = "torch.prim.ListConstruct"(%19011, %28905, %28906, %28907) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %28909 = "torch.aten.view"(%28904, %28908) : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%28909, %18474) <{shape_expressions = #map23}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %28910 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %28911 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %28912 = "torch.aten.add.Scalar"(%28848, %28910, %28911) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%28912, %18474) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %28913 = "torch.prim.ListConstruct"(%19011) : (!torch.int) -> !torch.list<int>
    %28914 = "torch.aten.view"(%28912, %28913) : (!torch.vtensor<[4,?],si64>, !torch.list<int>) -> !torch.vtensor<[?],si64>
    "torch.bind_symbolic_shape"(%28914, %18474) <{shape_expressions = #map24}> : (!torch.vtensor<[?],si64>, !torch.int) -> ()
    %28915 = "torch.prim.ListConstruct"(%28914) : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>>
    %28916 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %28917 = "torch.aten.index_put"(%28898, %28915, %28909, %28916) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.bool) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%28917, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %28918 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %28919 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %28920 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %28921 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %28922 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %28923 = "torch.prim.ListConstruct"(%18479, %28918, %28919, %28920, %28921, %28922) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %28924 = "torch.aten.view"(%28917, %28923) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%28924, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %28925 = "torch.constant.int"() <{value = 2097152 : i64}> : () -> !torch.int
    %28926 = "torch.prim.ListConstruct"(%18479, %28925) : (!torch.int, !torch.int) -> !torch.list<int>
    %28927 = "torch.aten.view"(%28924, %28926) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,2097152],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%28927, %18475) <{shape_expressions = #map2}> : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> ()
    %28928 = "torch.constant.int"() <{value = -2 : i64}> : () -> !torch.int
    %28929 = "torch.aten.unsqueeze"(%28843, %28928) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%28929, %18474) <{shape_expressions = #map25}> : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.int) -> ()
    %28930 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %28931 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %28932 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %28933 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %28934 = "torch.prim.ListConstruct"(%28930, %18481, %28931, %28932, %28933) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %28935 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %28936 = "torch.aten.expand"(%28929, %28934, %28935) : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%28936, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %28937 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %28938 = "torch.aten.clone"(%28936, %28937) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%28938, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %28939 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %28940 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %28941 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %28942 = "torch.prim.ListConstruct"(%28939, %18481, %28940, %28941) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %28943 = "torch.aten._unsafe_view"(%28938, %28942) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%28943, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %28944 = "torch.constant.int"() <{value = -2 : i64}> : () -> !torch.int
    %28945 = "torch.aten.unsqueeze"(%28543, %28944) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%28945, %18474) <{shape_expressions = #map25}> : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.int) -> ()
    %28946 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %28947 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %28948 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %28949 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %28950 = "torch.prim.ListConstruct"(%28946, %18481, %28947, %28948, %28949) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %28951 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %28952 = "torch.aten.expand"(%28945, %28950, %28951) : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%28952, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %28953 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %28954 = "torch.aten.clone"(%28952, %28953) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%28954, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %28955 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %28956 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %28957 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %28958 = "torch.prim.ListConstruct"(%28955, %18481, %28956, %28957) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %28959 = "torch.aten._unsafe_view"(%28954, %28958) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%28959, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %28960 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %28961 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %28962 = "torch.aten.transpose.int"(%28693, %28960, %28961) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%28962, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %28963 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %28964 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %28965 = "torch.aten.transpose.int"(%28943, %28963, %28964) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%28965, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %28966 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %28967 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %28968 = "torch.aten.transpose.int"(%28959, %28966, %28967) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%28968, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %28969 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %28970 = "torch.aten.squeeze.dim"(%18570, %28969) : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,1,?,?],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%28970, %18474) <{shape_expressions = #map8}> : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> ()
    %28971 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %28972 = "torch.aten.squeeze.dim"(%28970, %28971) : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,1,?,?],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%28972, %18474) <{shape_expressions = #map8}> : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> ()
    %28973 = "torch_c.to_builtin_tensor"(%28962) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %28974 = "torch_c.to_builtin_tensor"(%28965) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %28975 = "torch_c.to_builtin_tensor"(%28968) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %28976 = "torch_c.to_builtin_tensor"(%28972) : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>) -> tensor<4x1x?x?xf8E4M3FNUZ>
    %28977 = "tensor.cast"(%28976) : (tensor<4x1x?x?xf8E4M3FNUZ>) -> tensor<?x?x?x?xf8E4M3FNUZ>
    %28978 = "torch_c.to_builtin_tensor"(%17809) : (!torch.vtensor<[],f32>) -> tensor<f32>
    %28979 = "util.call"(%28973, %28974, %28975, %28978, %28977) <{callee = @sharktank_masked_flash_attention_4_32_128_128_f8E4M3FNUZ_f32_f32}> : (tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<f32>, tensor<?x?x?x?xf8E4M3FNUZ>) -> tensor<4x32x?x128xf32>
    %28980 = "torch_c.from_builtin_tensor"(%28979) : (tensor<4x32x?x128xf32>) -> !torch.vtensor<[4,32,?,128],f32>
    "torch.bind_symbolic_shape"(%28980, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f32>, !torch.int) -> ()
    %28981 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %28982 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %28983 = "torch.aten.transpose.int"(%28980, %28981, %28982) : (!torch.vtensor<[4,32,?,128],f32>, !torch.int, !torch.int) -> !torch.vtensor<[4,?,32,128],f32>
    "torch.bind_symbolic_shape"(%28983, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f32>, !torch.int) -> ()
    %28984 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %28985 = "torch.aten.clone"(%28983, %28984) : (!torch.vtensor<[4,?,32,128],f32>, !torch.int) -> !torch.vtensor<[4,?,32,128],f32>
    "torch.bind_symbolic_shape"(%28985, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f32>, !torch.int) -> ()
    %28986 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %28987 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %28988 = "torch.prim.ListConstruct"(%28986, %18481, %28987) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %28989 = "torch.aten._unsafe_view"(%28985, %28988) : (!torch.vtensor<[4,?,32,128],f32>, !torch.list<int>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%28989, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %28990 = "torch.aten.div.Tensor"(%28989, %17811) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%28990, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %28991 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %28992 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %28993 = "torch.aten.clamp"(%28990, %28991, %28992) : (!torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%28993, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %28994 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %28995 = "torch.prims.convert_element_type"(%28993, %28994) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%28995, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %28996 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %28997 = "torch.aten.unsqueeze"(%17813, %28996) : (!torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>
    %28998 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %28999 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %29000 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %29001 = "torch.prim.ListConstruct"(%28998, %28999, %29000) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %29002 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %29003 = "torch.aten.expand"(%28997, %29001, %29002) : (!torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ>
    %29004 = "torch_c.to_builtin_tensor"(%28995) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %29005 = "torch_c.to_builtin_tensor"(%29003) : (!torch.vtensor<[4,4096,4096],f8E4M3FNUZ>) -> tensor<4x4096x4096xf8E4M3FNUZ>
    %29006 = "util.call"(%29004, %29005) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x?x4096xf32>
    %29007 = "torch_c.from_builtin_tensor"(%29006) : (tensor<4x?x4096xf32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%29007, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %29008 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %29009 = "torch.prims.convert_element_type"(%29007, %29008) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%29009, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %29010 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %29011 = "torch.aten.add.Tensor"(%28437, %29009, %29010) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%29011, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %29012 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %29013 = "torch.prims.convert_element_type"(%29011, %29012) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%29013, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %29014 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %29015 = "torch.aten.pow.Tensor_Scalar"(%29013, %29014) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%29015, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %29016 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %29017 = "torch.prim.ListConstruct"(%29016) : (!torch.int) -> !torch.list<int>
    %29018 = "torch.constant.bool"() <{value = true}> : () -> !torch.bool
    %29019 = "torch.constant.none"() : () -> !torch.none
    %29020 = "torch.aten.mean.dim"(%29015, %29017, %29018, %29019) : (!torch.vtensor<[4,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%29020, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %29021 = "torch.constant.float"() <{value = 1.000000e-05 : f64}> : () -> !torch.float
    %29022 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %29023 = "torch.aten.add.Scalar"(%29020, %29021, %29022) : (!torch.vtensor<[4,?,1],f32>, !torch.float, !torch.int) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%29023, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %29024 = "torch.aten.rsqrt"(%29023) : (!torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%29024, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %29025 = "torch.aten.mul.Tensor"(%29013, %29024) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%29025, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %29026 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %29027 = "torch.prims.convert_element_type"(%29025, %29026) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%29027, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %29028 = "torch.aten.mul.Tensor"(%17815, %29027) : (!torch.vtensor<[4096],bf16>, !torch.vtensor<[4,?,4096],bf16>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%29028, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %29029 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %29030 = "torch.prims.convert_element_type"(%29028, %29029) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%29030, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %29031 = "torch.aten.div.Tensor"(%29030, %17817) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%29031, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %29032 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %29033 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %29034 = "torch.aten.clamp"(%29031, %29032, %29033) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%29034, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %29035 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %29036 = "torch.prims.convert_element_type"(%29034, %29035) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%29036, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %29037 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %29038 = "torch.aten.unsqueeze"(%17819, %29037) : (!torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>
    %29039 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %29040 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %29041 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %29042 = "torch.prim.ListConstruct"(%29039, %29040, %29041) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %29043 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %29044 = "torch.aten.expand"(%29038, %29042, %29043) : (!torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ>
    %29045 = "torch_c.to_builtin_tensor"(%29036) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %29046 = "torch_c.to_builtin_tensor"(%29044) : (!torch.vtensor<[4,14336,4096],f8E4M3FNUZ>) -> tensor<4x14336x4096xf8E4M3FNUZ>
    %29047 = "util.call"(%29045, %29046) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x?x14336xf32>
    %29048 = "torch_c.from_builtin_tensor"(%29047) : (tensor<4x?x14336xf32>) -> !torch.vtensor<[4,?,14336],f32>
    "torch.bind_symbolic_shape"(%29048, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> ()
    %29049 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %29050 = "torch.prims.convert_element_type"(%29048, %29049) : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%29050, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %29051 = "torch.aten.silu"(%29050) : (!torch.vtensor<[4,?,14336],bf16>) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%29051, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %29052 = "torch.aten.div.Tensor"(%29030, %17821) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%29052, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %29053 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %29054 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %29055 = "torch.aten.clamp"(%29052, %29053, %29054) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%29055, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %29056 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %29057 = "torch.prims.convert_element_type"(%29055, %29056) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%29057, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %29058 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %29059 = "torch.aten.unsqueeze"(%17823, %29058) : (!torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>
    %29060 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %29061 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %29062 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %29063 = "torch.prim.ListConstruct"(%29060, %29061, %29062) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %29064 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %29065 = "torch.aten.expand"(%29059, %29063, %29064) : (!torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ>
    %29066 = "torch_c.to_builtin_tensor"(%29057) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %29067 = "torch_c.to_builtin_tensor"(%29065) : (!torch.vtensor<[4,14336,4096],f8E4M3FNUZ>) -> tensor<4x14336x4096xf8E4M3FNUZ>
    %29068 = "util.call"(%29066, %29067) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x?x14336xf32>
    %29069 = "torch_c.from_builtin_tensor"(%29068) : (tensor<4x?x14336xf32>) -> !torch.vtensor<[4,?,14336],f32>
    "torch.bind_symbolic_shape"(%29069, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> ()
    %29070 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %29071 = "torch.prims.convert_element_type"(%29069, %29070) : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%29071, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %29072 = "torch.aten.mul.Tensor"(%29051, %29071) : (!torch.vtensor<[4,?,14336],bf16>, !torch.vtensor<[4,?,14336],bf16>) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%29072, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %29073 = "torch.aten.div.Tensor"(%29072, %17825) : (!torch.vtensor<[4,?,14336],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%29073, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %29074 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %29075 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %29076 = "torch.aten.clamp"(%29073, %29074, %29075) : (!torch.vtensor<[4,?,14336],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%29076, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %29077 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %29078 = "torch.prims.convert_element_type"(%29076, %29077) : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> !torch.vtensor<[4,?,14336],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%29078, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],f8E4M3FNUZ>, !torch.int) -> ()
    %29079 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %29080 = "torch.aten.unsqueeze"(%17827, %29079) : (!torch.vtensor<[4096,14336],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,14336],f8E4M3FNUZ>
    %29081 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %29082 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %29083 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %29084 = "torch.prim.ListConstruct"(%29081, %29082, %29083) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %29085 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %29086 = "torch.aten.expand"(%29080, %29084, %29085) : (!torch.vtensor<[1,4096,14336],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,14336],f8E4M3FNUZ>
    %29087 = "torch_c.to_builtin_tensor"(%29078) : (!torch.vtensor<[4,?,14336],f8E4M3FNUZ>) -> tensor<4x?x14336xf8E4M3FNUZ>
    %29088 = "torch_c.to_builtin_tensor"(%29086) : (!torch.vtensor<[4,4096,14336],f8E4M3FNUZ>) -> tensor<4x4096x14336xf8E4M3FNUZ>
    %29089 = "util.call"(%29087, %29088) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx14336xf8E4M3FNUZ_R4x4096x14336xf8E4M3FNUZ}> : (tensor<4x?x14336xf8E4M3FNUZ>, tensor<4x4096x14336xf8E4M3FNUZ>) -> tensor<4x?x4096xf32>
    %29090 = "torch_c.from_builtin_tensor"(%29089) : (tensor<4x?x4096xf32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%29090, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %29091 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %29092 = "torch.prims.convert_element_type"(%29090, %29091) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%29092, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %29093 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %29094 = "torch.aten.add.Tensor"(%29011, %29092, %29093) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%29094, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %29095 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %29096 = "torch.prims.convert_element_type"(%29094, %29095) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%29096, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %29097 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %29098 = "torch.aten.pow.Tensor_Scalar"(%29096, %29097) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%29098, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %29099 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %29100 = "torch.prim.ListConstruct"(%29099) : (!torch.int) -> !torch.list<int>
    %29101 = "torch.constant.bool"() <{value = true}> : () -> !torch.bool
    %29102 = "torch.constant.none"() : () -> !torch.none
    %29103 = "torch.aten.mean.dim"(%29098, %29100, %29101, %29102) : (!torch.vtensor<[4,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%29103, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %29104 = "torch.constant.float"() <{value = 1.000000e-05 : f64}> : () -> !torch.float
    %29105 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %29106 = "torch.aten.add.Scalar"(%29103, %29104, %29105) : (!torch.vtensor<[4,?,1],f32>, !torch.float, !torch.int) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%29106, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %29107 = "torch.aten.rsqrt"(%29106) : (!torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%29107, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %29108 = "torch.aten.mul.Tensor"(%29096, %29107) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%29108, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %29109 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %29110 = "torch.prims.convert_element_type"(%29108, %29109) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%29110, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %29111 = "torch.aten.mul.Tensor"(%17829, %29110) : (!torch.vtensor<[4096],bf16>, !torch.vtensor<[4,?,4096],bf16>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%29111, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %29112 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %29113 = "torch.prims.convert_element_type"(%29111, %29112) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%29113, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %29114 = "torch.aten.div.Tensor"(%29113, %17831) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%29114, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %29115 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %29116 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %29117 = "torch.aten.clamp"(%29114, %29115, %29116) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%29117, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %29118 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %29119 = "torch.prims.convert_element_type"(%29117, %29118) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%29119, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %29120 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %29121 = "torch.aten.unsqueeze"(%17833, %29120) : (!torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>
    %29122 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %29123 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %29124 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %29125 = "torch.prim.ListConstruct"(%29122, %29123, %29124) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %29126 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %29127 = "torch.aten.expand"(%29121, %29125, %29126) : (!torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ>
    %29128 = "torch_c.to_builtin_tensor"(%29119) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %29129 = "torch_c.to_builtin_tensor"(%29127) : (!torch.vtensor<[4,4096,4096],f8E4M3FNUZ>) -> tensor<4x4096x4096xf8E4M3FNUZ>
    %29130 = "util.call"(%29128, %29129) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x?x4096xf32>
    %29131 = "torch_c.from_builtin_tensor"(%29130) : (tensor<4x?x4096xf32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%29131, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %29132 = "torch.aten.div.Tensor"(%29131, %17835) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%29132, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %29133 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %29134 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %29135 = "torch.aten.clamp"(%29132, %29133, %29134) : (!torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%29135, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %29136 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %29137 = "torch.prims.convert_element_type"(%29135, %29136) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%29137, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %29138 = "torch.aten.div.Tensor"(%29113, %17837) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%29138, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %29139 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %29140 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %29141 = "torch.aten.clamp"(%29138, %29139, %29140) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%29141, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %29142 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %29143 = "torch.prims.convert_element_type"(%29141, %29142) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%29143, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %29144 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %29145 = "torch.aten.unsqueeze"(%17839, %29144) : (!torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>
    %29146 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %29147 = "torch.constant.int"() <{value = 1024 : i64}> : () -> !torch.int
    %29148 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %29149 = "torch.prim.ListConstruct"(%29146, %29147, %29148) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %29150 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %29151 = "torch.aten.expand"(%29145, %29149, %29150) : (!torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ>
    %29152 = "torch_c.to_builtin_tensor"(%29143) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %29153 = "torch_c.to_builtin_tensor"(%29151) : (!torch.vtensor<[4,1024,4096],f8E4M3FNUZ>) -> tensor<4x1024x4096xf8E4M3FNUZ>
    %29154 = "util.call"(%29152, %29153) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x?x1024xf32>
    %29155 = "torch_c.from_builtin_tensor"(%29154) : (tensor<4x?x1024xf32>) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%29155, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %29156 = "torch.aten.div.Tensor"(%29155, %17841) : (!torch.vtensor<[4,?,1024],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%29156, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %29157 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %29158 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %29159 = "torch.aten.clamp"(%29156, %29157, %29158) : (!torch.vtensor<[4,?,1024],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%29159, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %29160 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %29161 = "torch.prims.convert_element_type"(%29159, %29160) : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> !torch.vtensor<[4,?,1024],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%29161, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.int) -> ()
    %29162 = "torch.aten.div.Tensor"(%29113, %17843) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%29162, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %29163 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %29164 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %29165 = "torch.aten.clamp"(%29162, %29163, %29164) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%29165, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %29166 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %29167 = "torch.prims.convert_element_type"(%29165, %29166) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%29167, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %29168 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %29169 = "torch.aten.unsqueeze"(%17845, %29168) : (!torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>
    %29170 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %29171 = "torch.constant.int"() <{value = 1024 : i64}> : () -> !torch.int
    %29172 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %29173 = "torch.prim.ListConstruct"(%29170, %29171, %29172) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %29174 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %29175 = "torch.aten.expand"(%29169, %29173, %29174) : (!torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ>
    %29176 = "torch_c.to_builtin_tensor"(%29167) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %29177 = "torch_c.to_builtin_tensor"(%29175) : (!torch.vtensor<[4,1024,4096],f8E4M3FNUZ>) -> tensor<4x1024x4096xf8E4M3FNUZ>
    %29178 = "util.call"(%29176, %29177) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x?x1024xf32>
    %29179 = "torch_c.from_builtin_tensor"(%29178) : (tensor<4x?x1024xf32>) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%29179, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %29180 = "torch.aten.div.Tensor"(%29179, %17847) : (!torch.vtensor<[4,?,1024],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%29180, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %29181 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %29182 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %29183 = "torch.aten.clamp"(%29180, %29181, %29182) : (!torch.vtensor<[4,?,1024],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%29183, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %29184 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %29185 = "torch.prims.convert_element_type"(%29183, %29184) : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> !torch.vtensor<[4,?,1024],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%29185, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.int) -> ()
    %29186 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %29187 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %29188 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %29189 = "torch.prim.ListConstruct"(%29186, %18481, %29187, %29188) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %29190 = "torch.aten.view"(%29137, %29189) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%29190, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %29191 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %29192 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %29193 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %29194 = "torch.prim.ListConstruct"(%29191, %18481, %29192, %29193) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %29195 = "torch.aten.view"(%29161, %29194) : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%29195, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %29196 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %29197 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %29198 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %29199 = "torch.prim.ListConstruct"(%29196, %18481, %29197, %29198) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %29200 = "torch.aten.view"(%29185, %29199) : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%29200, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %29201 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
    %29202 = "torch.constant.none"() : () -> !torch.none
    %29203 = "torch.constant.none"() : () -> !torch.none
    %29204 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %29205 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %29206 = "torch.aten.arange"(%29201, %29202, %29203, %29204, %29205) : (!torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[131072],si64>
    %29207 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %29208 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %29209 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %29210 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %29211 = "torch.constant.none"() : () -> !torch.none
    %29212 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %29213 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %29214 = "torch.aten.arange.start_step"(%29207, %29208, %29209, %29210, %29211, %29212, %29213) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[64],si64>
    %29215 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %29216 = "torch.prims.convert_element_type"(%29214, %29215) : (!torch.vtensor<[64],si64>, !torch.int) -> !torch.vtensor<[64],f32>
    %29217 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %29218 = "torch.aten.div.Scalar"(%29216, %29217) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %29219 = "torch.constant.float"() <{value = 5.000000e+05 : f64}> : () -> !torch.float
    %29220 = "torch.aten.pow.Scalar"(%29219, %29218) : (!torch.float, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %29221 = "torch.aten.reciprocal"(%29220) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %29222 = "torch.constant.float"() <{value = 1.000000e+00 : f64}> : () -> !torch.float
    %29223 = "torch.aten.mul.Scalar"(%29221, %29222) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
    %29224 = "torch.aten.reciprocal"(%29223) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %29225 = "torch.constant.float"() <{value = 6.2831853071795862 : f64}> : () -> !torch.float
    %29226 = "torch.aten.mul.Scalar"(%29224, %29225) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
    %29227 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
    %29228 = "torch.aten.gt.Scalar"(%29226, %29227) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %29229 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %29230 = "torch.aten.div.Scalar"(%29223, %29229) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %29231 = "torch.aten.where.self"(%29228, %29230, %29223) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %29232 = "torch.aten.reciprocal"(%29226) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %29233 = "torch.constant.int"() <{value = 8192 : i64}> : () -> !torch.int
    %29234 = "torch.aten.mul.Scalar"(%29232, %29233) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %29235 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %29236 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %29237 = "torch.aten.sub.Scalar"(%29234, %29235, %29236) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
    %29238 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %29239 = "torch.aten.div.Scalar"(%29237, %29238) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %29240 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %29241 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %29242 = "torch.aten.rsub.Scalar"(%29239, %29240, %29241) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
    %29243 = "torch.aten.mul.Tensor"(%29242, %29231) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %29244 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %29245 = "torch.aten.div.Scalar"(%29243, %29244) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %29246 = "torch.aten.mul.Tensor"(%29239, %29231) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %29247 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %29248 = "torch.aten.add.Tensor"(%29245, %29246, %29247) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %29249 = "torch.constant.float"() <{value = 2.048000e+03 : f64}> : () -> !torch.float
    %29250 = "torch.aten.lt.Scalar"(%29226, %29249) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %29251 = "torch.aten.bitwise_not"(%29250) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %29252 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
    %29253 = "torch.aten.gt.Scalar"(%29226, %29252) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %29254 = "torch.aten.bitwise_not"(%29253) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %29255 = "torch.aten.mul.Tensor"(%29251, %29254) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %29256 = "torch.aten.where.self"(%29255, %29248, %29231) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %29257 = "torch.prim.ListConstruct"(%29256, %29256) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.list<vtensor>
    %29258 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %29259 = "torch.aten.cat"(%29257, %29258) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[128],f32>
    %29260 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %29261 = "torch.prims.convert_element_type"(%29206, %29260) : (!torch.vtensor<[131072],si64>, !torch.int) -> !torch.vtensor<[131072],f32>
    %29262 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %29263 = "torch.prims.convert_element_type"(%29259, %29262) : (!torch.vtensor<[128],f32>, !torch.int) -> !torch.vtensor<[128],f32>
    %29264 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
    %29265 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %29266 = "torch.prim.ListConstruct"(%29264, %29265) : (!torch.int, !torch.int) -> !torch.list<int>
    %29267 = "torch.aten.view"(%29261, %29266) : (!torch.vtensor<[131072],f32>, !torch.list<int>) -> !torch.vtensor<[131072,1],f32>
    %29268 = "torch.aten.mul.Tensor"(%29267, %29263) : (!torch.vtensor<[131072,1],f32>, !torch.vtensor<[128],f32>) -> !torch.vtensor<[131072,128],f32>
    %29269 = "torch.aten.cos"(%29268) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
    %29270 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %29271 = "torch.prims.convert_element_type"(%29269, %29270) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
    %29272 = "torch.aten.sin"(%29268) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
    %29273 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %29274 = "torch.prims.convert_element_type"(%29272, %29273) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
    %29275 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %29276 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %29277 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %29278 = "torch.aten.slice.Tensor"(%29271, %29275, %29276, %18481, %29277) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%29278, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %29279 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %29280 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %29281 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %29282 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %29283 = "torch.aten.slice.Tensor"(%29278, %29279, %29280, %29281, %29282) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%29283, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %29284 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %29285 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %29286 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %29287 = "torch.aten.slice.Tensor"(%29274, %29284, %29285, %18481, %29286) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%29287, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %29288 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %29289 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %29290 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %29291 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %29292 = "torch.aten.slice.Tensor"(%29287, %29288, %29289, %29290, %29291) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%29292, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %29293 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %29294 = "torch.aten.unsqueeze"(%29283, %29293) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%29294, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %29295 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %29296 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %29297 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %29298 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %29299 = "torch.aten.slice.Tensor"(%29294, %29295, %29296, %29297, %29298) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%29299, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %29300 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %29301 = "torch.aten.unsqueeze"(%29299, %29300) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%29301, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %29302 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %29303 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %29304 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %29305 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %29306 = "torch.aten.slice.Tensor"(%29301, %29302, %29303, %29304, %29305) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%29306, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %29307 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %29308 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %29309 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %29310 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %29311 = "torch.prim.ListConstruct"(%29307, %29308, %29309, %29310) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %29312 = "torch.aten.repeat"(%29306, %29311) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%29312, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
    %29313 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %29314 = "torch.aten.unsqueeze"(%29292, %29313) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%29314, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %29315 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %29316 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %29317 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %29318 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %29319 = "torch.aten.slice.Tensor"(%29314, %29315, %29316, %29317, %29318) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%29319, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %29320 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %29321 = "torch.aten.unsqueeze"(%29319, %29320) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%29321, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %29322 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %29323 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %29324 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %29325 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %29326 = "torch.aten.slice.Tensor"(%29321, %29322, %29323, %29324, %29325) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%29326, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %29327 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %29328 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %29329 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %29330 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %29331 = "torch.prim.ListConstruct"(%29327, %29328, %29329, %29330) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %29332 = "torch.aten.repeat"(%29326, %29331) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%29332, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
    %29333 = "torch.aten.mul.Tensor"(%29190, %29312) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%29333, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %29334 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %29335 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %29336 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %29337 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %29338 = "torch.aten.slice.Tensor"(%29190, %29334, %29335, %29336, %29337) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%29338, %18474) <{shape_expressions = #map18}> : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.int) -> ()
    %29339 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %29340 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %29341 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %29342 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %29343 = "torch.aten.slice.Tensor"(%29190, %29339, %29340, %29341, %29342) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%29343, %18474) <{shape_expressions = #map18}> : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.int) -> ()
    %29344 = "torch.aten.neg"(%29343) : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>) -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%29344, %18474) <{shape_expressions = #map18}> : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.int) -> ()
    %29345 = "torch.prim.ListConstruct"(%29344, %29338) : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>) -> !torch.list<vtensor>
    %29346 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %29347 = "torch.aten.cat"(%29345, %29346) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%29347, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %29348 = "torch.aten.mul.Tensor"(%29347, %29332) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%29348, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %29349 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %29350 = "torch.aten.add.Tensor"(%29333, %29348, %29349) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%29350, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %29351 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
    %29352 = "torch.constant.none"() : () -> !torch.none
    %29353 = "torch.constant.none"() : () -> !torch.none
    %29354 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %29355 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %29356 = "torch.aten.arange"(%29351, %29352, %29353, %29354, %29355) : (!torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[131072],si64>
    %29357 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %29358 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %29359 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %29360 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %29361 = "torch.constant.none"() : () -> !torch.none
    %29362 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %29363 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %29364 = "torch.aten.arange.start_step"(%29357, %29358, %29359, %29360, %29361, %29362, %29363) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[64],si64>
    %29365 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %29366 = "torch.prims.convert_element_type"(%29364, %29365) : (!torch.vtensor<[64],si64>, !torch.int) -> !torch.vtensor<[64],f32>
    %29367 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %29368 = "torch.aten.div.Scalar"(%29366, %29367) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %29369 = "torch.constant.float"() <{value = 5.000000e+05 : f64}> : () -> !torch.float
    %29370 = "torch.aten.pow.Scalar"(%29369, %29368) : (!torch.float, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %29371 = "torch.aten.reciprocal"(%29370) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %29372 = "torch.constant.float"() <{value = 1.000000e+00 : f64}> : () -> !torch.float
    %29373 = "torch.aten.mul.Scalar"(%29371, %29372) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
    %29374 = "torch.aten.reciprocal"(%29373) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %29375 = "torch.constant.float"() <{value = 6.2831853071795862 : f64}> : () -> !torch.float
    %29376 = "torch.aten.mul.Scalar"(%29374, %29375) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
    %29377 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
    %29378 = "torch.aten.gt.Scalar"(%29376, %29377) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %29379 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %29380 = "torch.aten.div.Scalar"(%29373, %29379) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %29381 = "torch.aten.where.self"(%29378, %29380, %29373) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %29382 = "torch.aten.reciprocal"(%29376) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %29383 = "torch.constant.int"() <{value = 8192 : i64}> : () -> !torch.int
    %29384 = "torch.aten.mul.Scalar"(%29382, %29383) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %29385 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %29386 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %29387 = "torch.aten.sub.Scalar"(%29384, %29385, %29386) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
    %29388 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %29389 = "torch.aten.div.Scalar"(%29387, %29388) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %29390 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %29391 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %29392 = "torch.aten.rsub.Scalar"(%29389, %29390, %29391) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
    %29393 = "torch.aten.mul.Tensor"(%29392, %29381) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %29394 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %29395 = "torch.aten.div.Scalar"(%29393, %29394) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %29396 = "torch.aten.mul.Tensor"(%29389, %29381) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %29397 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %29398 = "torch.aten.add.Tensor"(%29395, %29396, %29397) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %29399 = "torch.constant.float"() <{value = 2.048000e+03 : f64}> : () -> !torch.float
    %29400 = "torch.aten.lt.Scalar"(%29376, %29399) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %29401 = "torch.aten.bitwise_not"(%29400) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %29402 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
    %29403 = "torch.aten.gt.Scalar"(%29376, %29402) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %29404 = "torch.aten.bitwise_not"(%29403) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %29405 = "torch.aten.mul.Tensor"(%29401, %29404) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %29406 = "torch.aten.where.self"(%29405, %29398, %29381) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %29407 = "torch.prim.ListConstruct"(%29406, %29406) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.list<vtensor>
    %29408 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %29409 = "torch.aten.cat"(%29407, %29408) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[128],f32>
    %29410 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %29411 = "torch.prims.convert_element_type"(%29356, %29410) : (!torch.vtensor<[131072],si64>, !torch.int) -> !torch.vtensor<[131072],f32>
    %29412 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %29413 = "torch.prims.convert_element_type"(%29409, %29412) : (!torch.vtensor<[128],f32>, !torch.int) -> !torch.vtensor<[128],f32>
    %29414 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
    %29415 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %29416 = "torch.prim.ListConstruct"(%29414, %29415) : (!torch.int, !torch.int) -> !torch.list<int>
    %29417 = "torch.aten.view"(%29411, %29416) : (!torch.vtensor<[131072],f32>, !torch.list<int>) -> !torch.vtensor<[131072,1],f32>
    %29418 = "torch.aten.mul.Tensor"(%29417, %29413) : (!torch.vtensor<[131072,1],f32>, !torch.vtensor<[128],f32>) -> !torch.vtensor<[131072,128],f32>
    %29419 = "torch.aten.cos"(%29418) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
    %29420 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %29421 = "torch.prims.convert_element_type"(%29419, %29420) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
    %29422 = "torch.aten.sin"(%29418) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
    %29423 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %29424 = "torch.prims.convert_element_type"(%29422, %29423) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
    %29425 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %29426 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %29427 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %29428 = "torch.aten.slice.Tensor"(%29421, %29425, %29426, %18481, %29427) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%29428, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %29429 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %29430 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %29431 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %29432 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %29433 = "torch.aten.slice.Tensor"(%29428, %29429, %29430, %29431, %29432) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%29433, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %29434 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %29435 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %29436 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %29437 = "torch.aten.slice.Tensor"(%29424, %29434, %29435, %18481, %29436) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%29437, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %29438 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %29439 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %29440 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %29441 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %29442 = "torch.aten.slice.Tensor"(%29437, %29438, %29439, %29440, %29441) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%29442, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %29443 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %29444 = "torch.aten.unsqueeze"(%29433, %29443) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%29444, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %29445 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %29446 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %29447 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %29448 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %29449 = "torch.aten.slice.Tensor"(%29444, %29445, %29446, %29447, %29448) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%29449, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %29450 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %29451 = "torch.aten.unsqueeze"(%29449, %29450) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%29451, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %29452 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %29453 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %29454 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %29455 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %29456 = "torch.aten.slice.Tensor"(%29451, %29452, %29453, %29454, %29455) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%29456, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %29457 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %29458 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %29459 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %29460 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %29461 = "torch.prim.ListConstruct"(%29457, %29458, %29459, %29460) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %29462 = "torch.aten.repeat"(%29456, %29461) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%29462, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
    %29463 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %29464 = "torch.aten.unsqueeze"(%29442, %29463) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%29464, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %29465 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %29466 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %29467 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %29468 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %29469 = "torch.aten.slice.Tensor"(%29464, %29465, %29466, %29467, %29468) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%29469, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %29470 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %29471 = "torch.aten.unsqueeze"(%29469, %29470) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%29471, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %29472 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %29473 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %29474 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %29475 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %29476 = "torch.aten.slice.Tensor"(%29471, %29472, %29473, %29474, %29475) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%29476, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %29477 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %29478 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %29479 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %29480 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %29481 = "torch.prim.ListConstruct"(%29477, %29478, %29479, %29480) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %29482 = "torch.aten.repeat"(%29476, %29481) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%29482, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
    %29483 = "torch.aten.mul.Tensor"(%29195, %29462) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%29483, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %29484 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %29485 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %29486 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %29487 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %29488 = "torch.aten.slice.Tensor"(%29195, %29484, %29485, %29486, %29487) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%29488, %18474) <{shape_expressions = #map19}> : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.int) -> ()
    %29489 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %29490 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %29491 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %29492 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %29493 = "torch.aten.slice.Tensor"(%29195, %29489, %29490, %29491, %29492) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%29493, %18474) <{shape_expressions = #map19}> : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.int) -> ()
    %29494 = "torch.aten.neg"(%29493) : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>) -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%29494, %18474) <{shape_expressions = #map19}> : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.int) -> ()
    %29495 = "torch.prim.ListConstruct"(%29494, %29488) : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>) -> !torch.list<vtensor>
    %29496 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %29497 = "torch.aten.cat"(%29495, %29496) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%29497, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %29498 = "torch.aten.mul.Tensor"(%29497, %29482) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%29498, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %29499 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %29500 = "torch.aten.add.Tensor"(%29483, %29498, %29499) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%29500, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %29501 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %29502 = "torch.aten.mul.Scalar"(%arg69, %29501) : (!torch.vtensor<[4,?],si64>, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%29502, %18474) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %29503 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %29504 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %29505 = "torch.aten.add.Scalar"(%29502, %29503, %29504) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%29505, %18474) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %29506 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %29507 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %29508 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %29509 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %29510 = "torch.prim.ListConstruct"(%29506, %18477, %29507, %29508, %29509) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %29511 = "torch.aten.view"(%29500, %29510) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%29511, %18474) <{shape_expressions = #map22}> : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %29512 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %29513 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %29514 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %29515 = "torch.prim.ListConstruct"(%19011, %29512, %29513, %29514) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %29516 = "torch.aten.view"(%29511, %29515) : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%29516, %18474) <{shape_expressions = #map23}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %29517 = "torch.prim.ListConstruct"(%19011) : (!torch.int) -> !torch.list<int>
    %29518 = "torch.aten.view"(%29505, %29517) : (!torch.vtensor<[4,?],si64>, !torch.list<int>) -> !torch.vtensor<[?],si64>
    "torch.bind_symbolic_shape"(%29518, %18474) <{shape_expressions = #map24}> : (!torch.vtensor<[?],si64>, !torch.int) -> ()
    %29519 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %29520 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %29521 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %29522 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %29523 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %29524 = "torch.prim.ListConstruct"(%18479, %29519, %29520, %29521, %29522, %29523) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %29525 = "torch.aten.view"(%28927, %29524) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%29525, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %29526 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %29527 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %29528 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %29529 = "torch.prim.ListConstruct"(%18993, %29526, %29527, %29528) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %29530 = "torch.aten.view"(%29525, %29529) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%29530, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %29531 = "torch.prim.ListConstruct"(%29518) : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>>
    %29532 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %29533 = "torch.aten.index_put"(%29530, %29531, %29516, %29532) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.bool) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%29533, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %29534 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %29535 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %29536 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %29537 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %29538 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %29539 = "torch.prim.ListConstruct"(%18479, %29534, %29535, %29536, %29537, %29538) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %29540 = "torch.aten.view"(%29533, %29539) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%29540, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %29541 = "torch.constant.int"() <{value = 2097152 : i64}> : () -> !torch.int
    %29542 = "torch.prim.ListConstruct"(%18479, %29541) : (!torch.int, !torch.int) -> !torch.list<int>
    %29543 = "torch.aten.view"(%29540, %29542) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,2097152],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%29543, %18475) <{shape_expressions = #map2}> : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> ()
    %29544 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %29545 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %29546 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %29547 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %29548 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %29549 = "torch.prim.ListConstruct"(%18479, %29544, %29545, %29546, %29547, %29548) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %29550 = "torch.aten.view"(%29543, %29549) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%29550, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %29551 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %29552 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %29553 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %29554 = "torch.prim.ListConstruct"(%18993, %29551, %29552, %29553) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %29555 = "torch.aten.view"(%29550, %29554) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%29555, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %29556 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %29557 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %29558 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %29559 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %29560 = "torch.prim.ListConstruct"(%29556, %18477, %29557, %29558, %29559) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %29561 = "torch.aten.view"(%29200, %29560) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%29561, %18474) <{shape_expressions = #map22}> : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %29562 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %29563 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %29564 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %29565 = "torch.prim.ListConstruct"(%19011, %29562, %29563, %29564) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %29566 = "torch.aten.view"(%29561, %29565) : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%29566, %18474) <{shape_expressions = #map23}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %29567 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %29568 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %29569 = "torch.aten.add.Scalar"(%29505, %29567, %29568) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%29569, %18474) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %29570 = "torch.prim.ListConstruct"(%19011) : (!torch.int) -> !torch.list<int>
    %29571 = "torch.aten.view"(%29569, %29570) : (!torch.vtensor<[4,?],si64>, !torch.list<int>) -> !torch.vtensor<[?],si64>
    "torch.bind_symbolic_shape"(%29571, %18474) <{shape_expressions = #map24}> : (!torch.vtensor<[?],si64>, !torch.int) -> ()
    %29572 = "torch.prim.ListConstruct"(%29571) : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>>
    %29573 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %29574 = "torch.aten.index_put"(%29555, %29572, %29566, %29573) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.bool) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%29574, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %29575 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %29576 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %29577 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %29578 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %29579 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %29580 = "torch.prim.ListConstruct"(%18479, %29575, %29576, %29577, %29578, %29579) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %29581 = "torch.aten.view"(%29574, %29580) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%29581, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %29582 = "torch.constant.int"() <{value = 2097152 : i64}> : () -> !torch.int
    %29583 = "torch.prim.ListConstruct"(%18479, %29582) : (!torch.int, !torch.int) -> !torch.list<int>
    %29584 = "torch.aten.view"(%29581, %29583) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,2097152],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%29584, %18475) <{shape_expressions = #map2}> : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> ()
    %29585 = "torch.constant.int"() <{value = -2 : i64}> : () -> !torch.int
    %29586 = "torch.aten.unsqueeze"(%29500, %29585) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%29586, %18474) <{shape_expressions = #map25}> : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.int) -> ()
    %29587 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %29588 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %29589 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %29590 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %29591 = "torch.prim.ListConstruct"(%29587, %18481, %29588, %29589, %29590) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %29592 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %29593 = "torch.aten.expand"(%29586, %29591, %29592) : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%29593, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %29594 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %29595 = "torch.aten.clone"(%29593, %29594) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%29595, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %29596 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %29597 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %29598 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %29599 = "torch.prim.ListConstruct"(%29596, %18481, %29597, %29598) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %29600 = "torch.aten._unsafe_view"(%29595, %29599) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%29600, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %29601 = "torch.constant.int"() <{value = -2 : i64}> : () -> !torch.int
    %29602 = "torch.aten.unsqueeze"(%29200, %29601) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%29602, %18474) <{shape_expressions = #map25}> : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.int) -> ()
    %29603 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %29604 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %29605 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %29606 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %29607 = "torch.prim.ListConstruct"(%29603, %18481, %29604, %29605, %29606) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %29608 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %29609 = "torch.aten.expand"(%29602, %29607, %29608) : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%29609, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %29610 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %29611 = "torch.aten.clone"(%29609, %29610) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%29611, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %29612 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %29613 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %29614 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %29615 = "torch.prim.ListConstruct"(%29612, %18481, %29613, %29614) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %29616 = "torch.aten._unsafe_view"(%29611, %29615) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%29616, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %29617 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %29618 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %29619 = "torch.aten.transpose.int"(%29350, %29617, %29618) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%29619, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %29620 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %29621 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %29622 = "torch.aten.transpose.int"(%29600, %29620, %29621) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%29622, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %29623 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %29624 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %29625 = "torch.aten.transpose.int"(%29616, %29623, %29624) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%29625, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %29626 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %29627 = "torch.aten.squeeze.dim"(%18570, %29626) : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,1,?,?],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%29627, %18474) <{shape_expressions = #map8}> : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> ()
    %29628 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %29629 = "torch.aten.squeeze.dim"(%29627, %29628) : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,1,?,?],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%29629, %18474) <{shape_expressions = #map8}> : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> ()
    %29630 = "torch_c.to_builtin_tensor"(%29619) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %29631 = "torch_c.to_builtin_tensor"(%29622) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %29632 = "torch_c.to_builtin_tensor"(%29625) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %29633 = "torch_c.to_builtin_tensor"(%29629) : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>) -> tensor<4x1x?x?xf8E4M3FNUZ>
    %29634 = "tensor.cast"(%29633) : (tensor<4x1x?x?xf8E4M3FNUZ>) -> tensor<?x?x?x?xf8E4M3FNUZ>
    %29635 = "torch_c.to_builtin_tensor"(%17849) : (!torch.vtensor<[],f32>) -> tensor<f32>
    %29636 = "util.call"(%29630, %29631, %29632, %29635, %29634) <{callee = @sharktank_masked_flash_attention_4_32_128_128_f8E4M3FNUZ_f32_f32}> : (tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<f32>, tensor<?x?x?x?xf8E4M3FNUZ>) -> tensor<4x32x?x128xf32>
    %29637 = "torch_c.from_builtin_tensor"(%29636) : (tensor<4x32x?x128xf32>) -> !torch.vtensor<[4,32,?,128],f32>
    "torch.bind_symbolic_shape"(%29637, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f32>, !torch.int) -> ()
    %29638 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %29639 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %29640 = "torch.aten.transpose.int"(%29637, %29638, %29639) : (!torch.vtensor<[4,32,?,128],f32>, !torch.int, !torch.int) -> !torch.vtensor<[4,?,32,128],f32>
    "torch.bind_symbolic_shape"(%29640, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f32>, !torch.int) -> ()
    %29641 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %29642 = "torch.aten.clone"(%29640, %29641) : (!torch.vtensor<[4,?,32,128],f32>, !torch.int) -> !torch.vtensor<[4,?,32,128],f32>
    "torch.bind_symbolic_shape"(%29642, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f32>, !torch.int) -> ()
    %29643 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %29644 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %29645 = "torch.prim.ListConstruct"(%29643, %18481, %29644) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %29646 = "torch.aten._unsafe_view"(%29642, %29645) : (!torch.vtensor<[4,?,32,128],f32>, !torch.list<int>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%29646, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %29647 = "torch.aten.div.Tensor"(%29646, %17851) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%29647, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %29648 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %29649 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %29650 = "torch.aten.clamp"(%29647, %29648, %29649) : (!torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%29650, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %29651 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %29652 = "torch.prims.convert_element_type"(%29650, %29651) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%29652, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %29653 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %29654 = "torch.aten.unsqueeze"(%17853, %29653) : (!torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>
    %29655 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %29656 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %29657 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %29658 = "torch.prim.ListConstruct"(%29655, %29656, %29657) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %29659 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %29660 = "torch.aten.expand"(%29654, %29658, %29659) : (!torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ>
    %29661 = "torch_c.to_builtin_tensor"(%29652) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %29662 = "torch_c.to_builtin_tensor"(%29660) : (!torch.vtensor<[4,4096,4096],f8E4M3FNUZ>) -> tensor<4x4096x4096xf8E4M3FNUZ>
    %29663 = "util.call"(%29661, %29662) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x?x4096xf32>
    %29664 = "torch_c.from_builtin_tensor"(%29663) : (tensor<4x?x4096xf32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%29664, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %29665 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %29666 = "torch.prims.convert_element_type"(%29664, %29665) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%29666, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %29667 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %29668 = "torch.aten.add.Tensor"(%29094, %29666, %29667) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%29668, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %29669 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %29670 = "torch.prims.convert_element_type"(%29668, %29669) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%29670, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %29671 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %29672 = "torch.aten.pow.Tensor_Scalar"(%29670, %29671) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%29672, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %29673 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %29674 = "torch.prim.ListConstruct"(%29673) : (!torch.int) -> !torch.list<int>
    %29675 = "torch.constant.bool"() <{value = true}> : () -> !torch.bool
    %29676 = "torch.constant.none"() : () -> !torch.none
    %29677 = "torch.aten.mean.dim"(%29672, %29674, %29675, %29676) : (!torch.vtensor<[4,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%29677, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %29678 = "torch.constant.float"() <{value = 1.000000e-05 : f64}> : () -> !torch.float
    %29679 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %29680 = "torch.aten.add.Scalar"(%29677, %29678, %29679) : (!torch.vtensor<[4,?,1],f32>, !torch.float, !torch.int) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%29680, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %29681 = "torch.aten.rsqrt"(%29680) : (!torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%29681, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %29682 = "torch.aten.mul.Tensor"(%29670, %29681) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%29682, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %29683 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %29684 = "torch.prims.convert_element_type"(%29682, %29683) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%29684, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %29685 = "torch.aten.mul.Tensor"(%17855, %29684) : (!torch.vtensor<[4096],bf16>, !torch.vtensor<[4,?,4096],bf16>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%29685, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %29686 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %29687 = "torch.prims.convert_element_type"(%29685, %29686) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%29687, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %29688 = "torch.aten.div.Tensor"(%29687, %17857) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%29688, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %29689 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %29690 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %29691 = "torch.aten.clamp"(%29688, %29689, %29690) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%29691, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %29692 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %29693 = "torch.prims.convert_element_type"(%29691, %29692) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%29693, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %29694 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %29695 = "torch.aten.unsqueeze"(%17859, %29694) : (!torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>
    %29696 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %29697 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %29698 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %29699 = "torch.prim.ListConstruct"(%29696, %29697, %29698) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %29700 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %29701 = "torch.aten.expand"(%29695, %29699, %29700) : (!torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ>
    %29702 = "torch_c.to_builtin_tensor"(%29693) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %29703 = "torch_c.to_builtin_tensor"(%29701) : (!torch.vtensor<[4,14336,4096],f8E4M3FNUZ>) -> tensor<4x14336x4096xf8E4M3FNUZ>
    %29704 = "util.call"(%29702, %29703) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x?x14336xf32>
    %29705 = "torch_c.from_builtin_tensor"(%29704) : (tensor<4x?x14336xf32>) -> !torch.vtensor<[4,?,14336],f32>
    "torch.bind_symbolic_shape"(%29705, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> ()
    %29706 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %29707 = "torch.prims.convert_element_type"(%29705, %29706) : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%29707, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %29708 = "torch.aten.silu"(%29707) : (!torch.vtensor<[4,?,14336],bf16>) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%29708, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %29709 = "torch.aten.div.Tensor"(%29687, %17861) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%29709, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %29710 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %29711 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %29712 = "torch.aten.clamp"(%29709, %29710, %29711) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%29712, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %29713 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %29714 = "torch.prims.convert_element_type"(%29712, %29713) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%29714, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %29715 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %29716 = "torch.aten.unsqueeze"(%17863, %29715) : (!torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>
    %29717 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %29718 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %29719 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %29720 = "torch.prim.ListConstruct"(%29717, %29718, %29719) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %29721 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %29722 = "torch.aten.expand"(%29716, %29720, %29721) : (!torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ>
    %29723 = "torch_c.to_builtin_tensor"(%29714) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %29724 = "torch_c.to_builtin_tensor"(%29722) : (!torch.vtensor<[4,14336,4096],f8E4M3FNUZ>) -> tensor<4x14336x4096xf8E4M3FNUZ>
    %29725 = "util.call"(%29723, %29724) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x?x14336xf32>
    %29726 = "torch_c.from_builtin_tensor"(%29725) : (tensor<4x?x14336xf32>) -> !torch.vtensor<[4,?,14336],f32>
    "torch.bind_symbolic_shape"(%29726, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> ()
    %29727 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %29728 = "torch.prims.convert_element_type"(%29726, %29727) : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%29728, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %29729 = "torch.aten.mul.Tensor"(%29708, %29728) : (!torch.vtensor<[4,?,14336],bf16>, !torch.vtensor<[4,?,14336],bf16>) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%29729, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %29730 = "torch.aten.div.Tensor"(%29729, %17865) : (!torch.vtensor<[4,?,14336],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%29730, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %29731 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %29732 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %29733 = "torch.aten.clamp"(%29730, %29731, %29732) : (!torch.vtensor<[4,?,14336],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%29733, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %29734 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %29735 = "torch.prims.convert_element_type"(%29733, %29734) : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> !torch.vtensor<[4,?,14336],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%29735, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],f8E4M3FNUZ>, !torch.int) -> ()
    %29736 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %29737 = "torch.aten.unsqueeze"(%17867, %29736) : (!torch.vtensor<[4096,14336],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,14336],f8E4M3FNUZ>
    %29738 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %29739 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %29740 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %29741 = "torch.prim.ListConstruct"(%29738, %29739, %29740) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %29742 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %29743 = "torch.aten.expand"(%29737, %29741, %29742) : (!torch.vtensor<[1,4096,14336],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,14336],f8E4M3FNUZ>
    %29744 = "torch_c.to_builtin_tensor"(%29735) : (!torch.vtensor<[4,?,14336],f8E4M3FNUZ>) -> tensor<4x?x14336xf8E4M3FNUZ>
    %29745 = "torch_c.to_builtin_tensor"(%29743) : (!torch.vtensor<[4,4096,14336],f8E4M3FNUZ>) -> tensor<4x4096x14336xf8E4M3FNUZ>
    %29746 = "util.call"(%29744, %29745) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx14336xf8E4M3FNUZ_R4x4096x14336xf8E4M3FNUZ}> : (tensor<4x?x14336xf8E4M3FNUZ>, tensor<4x4096x14336xf8E4M3FNUZ>) -> tensor<4x?x4096xf32>
    %29747 = "torch_c.from_builtin_tensor"(%29746) : (tensor<4x?x4096xf32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%29747, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %29748 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %29749 = "torch.prims.convert_element_type"(%29747, %29748) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%29749, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %29750 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %29751 = "torch.aten.add.Tensor"(%29668, %29749, %29750) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%29751, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %29752 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %29753 = "torch.prims.convert_element_type"(%29751, %29752) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%29753, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %29754 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %29755 = "torch.aten.pow.Tensor_Scalar"(%29753, %29754) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%29755, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %29756 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %29757 = "torch.prim.ListConstruct"(%29756) : (!torch.int) -> !torch.list<int>
    %29758 = "torch.constant.bool"() <{value = true}> : () -> !torch.bool
    %29759 = "torch.constant.none"() : () -> !torch.none
    %29760 = "torch.aten.mean.dim"(%29755, %29757, %29758, %29759) : (!torch.vtensor<[4,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%29760, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %29761 = "torch.constant.float"() <{value = 1.000000e-05 : f64}> : () -> !torch.float
    %29762 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %29763 = "torch.aten.add.Scalar"(%29760, %29761, %29762) : (!torch.vtensor<[4,?,1],f32>, !torch.float, !torch.int) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%29763, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %29764 = "torch.aten.rsqrt"(%29763) : (!torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%29764, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %29765 = "torch.aten.mul.Tensor"(%29753, %29764) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%29765, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %29766 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %29767 = "torch.prims.convert_element_type"(%29765, %29766) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%29767, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %29768 = "torch.aten.mul.Tensor"(%17869, %29767) : (!torch.vtensor<[4096],bf16>, !torch.vtensor<[4,?,4096],bf16>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%29768, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %29769 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %29770 = "torch.prims.convert_element_type"(%29768, %29769) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%29770, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %29771 = "torch.aten.div.Tensor"(%29770, %17871) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%29771, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %29772 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %29773 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %29774 = "torch.aten.clamp"(%29771, %29772, %29773) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%29774, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %29775 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %29776 = "torch.prims.convert_element_type"(%29774, %29775) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%29776, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %29777 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %29778 = "torch.aten.unsqueeze"(%17873, %29777) : (!torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>
    %29779 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %29780 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %29781 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %29782 = "torch.prim.ListConstruct"(%29779, %29780, %29781) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %29783 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %29784 = "torch.aten.expand"(%29778, %29782, %29783) : (!torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ>
    %29785 = "torch_c.to_builtin_tensor"(%29776) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %29786 = "torch_c.to_builtin_tensor"(%29784) : (!torch.vtensor<[4,4096,4096],f8E4M3FNUZ>) -> tensor<4x4096x4096xf8E4M3FNUZ>
    %29787 = "util.call"(%29785, %29786) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x?x4096xf32>
    %29788 = "torch_c.from_builtin_tensor"(%29787) : (tensor<4x?x4096xf32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%29788, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %29789 = "torch.aten.div.Tensor"(%29788, %17875) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%29789, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %29790 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %29791 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %29792 = "torch.aten.clamp"(%29789, %29790, %29791) : (!torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%29792, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %29793 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %29794 = "torch.prims.convert_element_type"(%29792, %29793) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%29794, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %29795 = "torch.aten.div.Tensor"(%29770, %17877) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%29795, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %29796 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %29797 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %29798 = "torch.aten.clamp"(%29795, %29796, %29797) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%29798, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %29799 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %29800 = "torch.prims.convert_element_type"(%29798, %29799) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%29800, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %29801 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %29802 = "torch.aten.unsqueeze"(%17879, %29801) : (!torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>
    %29803 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %29804 = "torch.constant.int"() <{value = 1024 : i64}> : () -> !torch.int
    %29805 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %29806 = "torch.prim.ListConstruct"(%29803, %29804, %29805) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %29807 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %29808 = "torch.aten.expand"(%29802, %29806, %29807) : (!torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ>
    %29809 = "torch_c.to_builtin_tensor"(%29800) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %29810 = "torch_c.to_builtin_tensor"(%29808) : (!torch.vtensor<[4,1024,4096],f8E4M3FNUZ>) -> tensor<4x1024x4096xf8E4M3FNUZ>
    %29811 = "util.call"(%29809, %29810) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x?x1024xf32>
    %29812 = "torch_c.from_builtin_tensor"(%29811) : (tensor<4x?x1024xf32>) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%29812, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %29813 = "torch.aten.div.Tensor"(%29812, %17881) : (!torch.vtensor<[4,?,1024],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%29813, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %29814 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %29815 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %29816 = "torch.aten.clamp"(%29813, %29814, %29815) : (!torch.vtensor<[4,?,1024],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%29816, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %29817 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %29818 = "torch.prims.convert_element_type"(%29816, %29817) : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> !torch.vtensor<[4,?,1024],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%29818, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.int) -> ()
    %29819 = "torch.aten.div.Tensor"(%29770, %17883) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%29819, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %29820 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %29821 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %29822 = "torch.aten.clamp"(%29819, %29820, %29821) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%29822, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %29823 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %29824 = "torch.prims.convert_element_type"(%29822, %29823) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%29824, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %29825 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %29826 = "torch.aten.unsqueeze"(%17885, %29825) : (!torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>
    %29827 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %29828 = "torch.constant.int"() <{value = 1024 : i64}> : () -> !torch.int
    %29829 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %29830 = "torch.prim.ListConstruct"(%29827, %29828, %29829) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %29831 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %29832 = "torch.aten.expand"(%29826, %29830, %29831) : (!torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ>
    %29833 = "torch_c.to_builtin_tensor"(%29824) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %29834 = "torch_c.to_builtin_tensor"(%29832) : (!torch.vtensor<[4,1024,4096],f8E4M3FNUZ>) -> tensor<4x1024x4096xf8E4M3FNUZ>
    %29835 = "util.call"(%29833, %29834) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x?x1024xf32>
    %29836 = "torch_c.from_builtin_tensor"(%29835) : (tensor<4x?x1024xf32>) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%29836, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %29837 = "torch.aten.div.Tensor"(%29836, %17887) : (!torch.vtensor<[4,?,1024],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%29837, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %29838 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %29839 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %29840 = "torch.aten.clamp"(%29837, %29838, %29839) : (!torch.vtensor<[4,?,1024],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%29840, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %29841 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %29842 = "torch.prims.convert_element_type"(%29840, %29841) : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> !torch.vtensor<[4,?,1024],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%29842, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.int) -> ()
    %29843 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %29844 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %29845 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %29846 = "torch.prim.ListConstruct"(%29843, %18481, %29844, %29845) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %29847 = "torch.aten.view"(%29794, %29846) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%29847, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %29848 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %29849 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %29850 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %29851 = "torch.prim.ListConstruct"(%29848, %18481, %29849, %29850) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %29852 = "torch.aten.view"(%29818, %29851) : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%29852, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %29853 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %29854 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %29855 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %29856 = "torch.prim.ListConstruct"(%29853, %18481, %29854, %29855) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %29857 = "torch.aten.view"(%29842, %29856) : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%29857, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %29858 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
    %29859 = "torch.constant.none"() : () -> !torch.none
    %29860 = "torch.constant.none"() : () -> !torch.none
    %29861 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %29862 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %29863 = "torch.aten.arange"(%29858, %29859, %29860, %29861, %29862) : (!torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[131072],si64>
    %29864 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %29865 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %29866 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %29867 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %29868 = "torch.constant.none"() : () -> !torch.none
    %29869 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %29870 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %29871 = "torch.aten.arange.start_step"(%29864, %29865, %29866, %29867, %29868, %29869, %29870) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[64],si64>
    %29872 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %29873 = "torch.prims.convert_element_type"(%29871, %29872) : (!torch.vtensor<[64],si64>, !torch.int) -> !torch.vtensor<[64],f32>
    %29874 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %29875 = "torch.aten.div.Scalar"(%29873, %29874) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %29876 = "torch.constant.float"() <{value = 5.000000e+05 : f64}> : () -> !torch.float
    %29877 = "torch.aten.pow.Scalar"(%29876, %29875) : (!torch.float, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %29878 = "torch.aten.reciprocal"(%29877) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %29879 = "torch.constant.float"() <{value = 1.000000e+00 : f64}> : () -> !torch.float
    %29880 = "torch.aten.mul.Scalar"(%29878, %29879) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
    %29881 = "torch.aten.reciprocal"(%29880) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %29882 = "torch.constant.float"() <{value = 6.2831853071795862 : f64}> : () -> !torch.float
    %29883 = "torch.aten.mul.Scalar"(%29881, %29882) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
    %29884 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
    %29885 = "torch.aten.gt.Scalar"(%29883, %29884) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %29886 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %29887 = "torch.aten.div.Scalar"(%29880, %29886) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %29888 = "torch.aten.where.self"(%29885, %29887, %29880) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %29889 = "torch.aten.reciprocal"(%29883) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %29890 = "torch.constant.int"() <{value = 8192 : i64}> : () -> !torch.int
    %29891 = "torch.aten.mul.Scalar"(%29889, %29890) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %29892 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %29893 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %29894 = "torch.aten.sub.Scalar"(%29891, %29892, %29893) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
    %29895 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %29896 = "torch.aten.div.Scalar"(%29894, %29895) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %29897 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %29898 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %29899 = "torch.aten.rsub.Scalar"(%29896, %29897, %29898) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
    %29900 = "torch.aten.mul.Tensor"(%29899, %29888) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %29901 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %29902 = "torch.aten.div.Scalar"(%29900, %29901) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %29903 = "torch.aten.mul.Tensor"(%29896, %29888) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %29904 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %29905 = "torch.aten.add.Tensor"(%29902, %29903, %29904) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %29906 = "torch.constant.float"() <{value = 2.048000e+03 : f64}> : () -> !torch.float
    %29907 = "torch.aten.lt.Scalar"(%29883, %29906) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %29908 = "torch.aten.bitwise_not"(%29907) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %29909 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
    %29910 = "torch.aten.gt.Scalar"(%29883, %29909) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %29911 = "torch.aten.bitwise_not"(%29910) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %29912 = "torch.aten.mul.Tensor"(%29908, %29911) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %29913 = "torch.aten.where.self"(%29912, %29905, %29888) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %29914 = "torch.prim.ListConstruct"(%29913, %29913) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.list<vtensor>
    %29915 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %29916 = "torch.aten.cat"(%29914, %29915) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[128],f32>
    %29917 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %29918 = "torch.prims.convert_element_type"(%29863, %29917) : (!torch.vtensor<[131072],si64>, !torch.int) -> !torch.vtensor<[131072],f32>
    %29919 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %29920 = "torch.prims.convert_element_type"(%29916, %29919) : (!torch.vtensor<[128],f32>, !torch.int) -> !torch.vtensor<[128],f32>
    %29921 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
    %29922 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %29923 = "torch.prim.ListConstruct"(%29921, %29922) : (!torch.int, !torch.int) -> !torch.list<int>
    %29924 = "torch.aten.view"(%29918, %29923) : (!torch.vtensor<[131072],f32>, !torch.list<int>) -> !torch.vtensor<[131072,1],f32>
    %29925 = "torch.aten.mul.Tensor"(%29924, %29920) : (!torch.vtensor<[131072,1],f32>, !torch.vtensor<[128],f32>) -> !torch.vtensor<[131072,128],f32>
    %29926 = "torch.aten.cos"(%29925) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
    %29927 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %29928 = "torch.prims.convert_element_type"(%29926, %29927) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
    %29929 = "torch.aten.sin"(%29925) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
    %29930 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %29931 = "torch.prims.convert_element_type"(%29929, %29930) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
    %29932 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %29933 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %29934 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %29935 = "torch.aten.slice.Tensor"(%29928, %29932, %29933, %18481, %29934) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%29935, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %29936 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %29937 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %29938 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %29939 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %29940 = "torch.aten.slice.Tensor"(%29935, %29936, %29937, %29938, %29939) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%29940, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %29941 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %29942 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %29943 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %29944 = "torch.aten.slice.Tensor"(%29931, %29941, %29942, %18481, %29943) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%29944, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %29945 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %29946 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %29947 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %29948 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %29949 = "torch.aten.slice.Tensor"(%29944, %29945, %29946, %29947, %29948) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%29949, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %29950 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %29951 = "torch.aten.unsqueeze"(%29940, %29950) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%29951, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %29952 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %29953 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %29954 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %29955 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %29956 = "torch.aten.slice.Tensor"(%29951, %29952, %29953, %29954, %29955) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%29956, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %29957 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %29958 = "torch.aten.unsqueeze"(%29956, %29957) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%29958, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %29959 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %29960 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %29961 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %29962 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %29963 = "torch.aten.slice.Tensor"(%29958, %29959, %29960, %29961, %29962) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%29963, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %29964 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %29965 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %29966 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %29967 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %29968 = "torch.prim.ListConstruct"(%29964, %29965, %29966, %29967) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %29969 = "torch.aten.repeat"(%29963, %29968) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%29969, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
    %29970 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %29971 = "torch.aten.unsqueeze"(%29949, %29970) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%29971, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %29972 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %29973 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %29974 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %29975 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %29976 = "torch.aten.slice.Tensor"(%29971, %29972, %29973, %29974, %29975) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%29976, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %29977 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %29978 = "torch.aten.unsqueeze"(%29976, %29977) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%29978, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %29979 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %29980 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %29981 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %29982 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %29983 = "torch.aten.slice.Tensor"(%29978, %29979, %29980, %29981, %29982) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%29983, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %29984 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %29985 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %29986 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %29987 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %29988 = "torch.prim.ListConstruct"(%29984, %29985, %29986, %29987) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %29989 = "torch.aten.repeat"(%29983, %29988) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%29989, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
    %29990 = "torch.aten.mul.Tensor"(%29847, %29969) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%29990, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %29991 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %29992 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %29993 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %29994 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %29995 = "torch.aten.slice.Tensor"(%29847, %29991, %29992, %29993, %29994) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%29995, %18474) <{shape_expressions = #map18}> : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.int) -> ()
    %29996 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %29997 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %29998 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %29999 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %30000 = "torch.aten.slice.Tensor"(%29847, %29996, %29997, %29998, %29999) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%30000, %18474) <{shape_expressions = #map18}> : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.int) -> ()
    %30001 = "torch.aten.neg"(%30000) : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>) -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%30001, %18474) <{shape_expressions = #map18}> : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.int) -> ()
    %30002 = "torch.prim.ListConstruct"(%30001, %29995) : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>) -> !torch.list<vtensor>
    %30003 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %30004 = "torch.aten.cat"(%30002, %30003) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%30004, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %30005 = "torch.aten.mul.Tensor"(%30004, %29989) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%30005, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %30006 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %30007 = "torch.aten.add.Tensor"(%29990, %30005, %30006) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%30007, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %30008 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
    %30009 = "torch.constant.none"() : () -> !torch.none
    %30010 = "torch.constant.none"() : () -> !torch.none
    %30011 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %30012 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %30013 = "torch.aten.arange"(%30008, %30009, %30010, %30011, %30012) : (!torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[131072],si64>
    %30014 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %30015 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %30016 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %30017 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %30018 = "torch.constant.none"() : () -> !torch.none
    %30019 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %30020 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %30021 = "torch.aten.arange.start_step"(%30014, %30015, %30016, %30017, %30018, %30019, %30020) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[64],si64>
    %30022 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %30023 = "torch.prims.convert_element_type"(%30021, %30022) : (!torch.vtensor<[64],si64>, !torch.int) -> !torch.vtensor<[64],f32>
    %30024 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %30025 = "torch.aten.div.Scalar"(%30023, %30024) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %30026 = "torch.constant.float"() <{value = 5.000000e+05 : f64}> : () -> !torch.float
    %30027 = "torch.aten.pow.Scalar"(%30026, %30025) : (!torch.float, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %30028 = "torch.aten.reciprocal"(%30027) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %30029 = "torch.constant.float"() <{value = 1.000000e+00 : f64}> : () -> !torch.float
    %30030 = "torch.aten.mul.Scalar"(%30028, %30029) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
    %30031 = "torch.aten.reciprocal"(%30030) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %30032 = "torch.constant.float"() <{value = 6.2831853071795862 : f64}> : () -> !torch.float
    %30033 = "torch.aten.mul.Scalar"(%30031, %30032) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
    %30034 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
    %30035 = "torch.aten.gt.Scalar"(%30033, %30034) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %30036 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %30037 = "torch.aten.div.Scalar"(%30030, %30036) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %30038 = "torch.aten.where.self"(%30035, %30037, %30030) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %30039 = "torch.aten.reciprocal"(%30033) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %30040 = "torch.constant.int"() <{value = 8192 : i64}> : () -> !torch.int
    %30041 = "torch.aten.mul.Scalar"(%30039, %30040) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %30042 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %30043 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %30044 = "torch.aten.sub.Scalar"(%30041, %30042, %30043) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
    %30045 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %30046 = "torch.aten.div.Scalar"(%30044, %30045) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %30047 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %30048 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %30049 = "torch.aten.rsub.Scalar"(%30046, %30047, %30048) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
    %30050 = "torch.aten.mul.Tensor"(%30049, %30038) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %30051 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %30052 = "torch.aten.div.Scalar"(%30050, %30051) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %30053 = "torch.aten.mul.Tensor"(%30046, %30038) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %30054 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %30055 = "torch.aten.add.Tensor"(%30052, %30053, %30054) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %30056 = "torch.constant.float"() <{value = 2.048000e+03 : f64}> : () -> !torch.float
    %30057 = "torch.aten.lt.Scalar"(%30033, %30056) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %30058 = "torch.aten.bitwise_not"(%30057) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %30059 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
    %30060 = "torch.aten.gt.Scalar"(%30033, %30059) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %30061 = "torch.aten.bitwise_not"(%30060) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %30062 = "torch.aten.mul.Tensor"(%30058, %30061) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %30063 = "torch.aten.where.self"(%30062, %30055, %30038) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %30064 = "torch.prim.ListConstruct"(%30063, %30063) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.list<vtensor>
    %30065 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %30066 = "torch.aten.cat"(%30064, %30065) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[128],f32>
    %30067 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %30068 = "torch.prims.convert_element_type"(%30013, %30067) : (!torch.vtensor<[131072],si64>, !torch.int) -> !torch.vtensor<[131072],f32>
    %30069 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %30070 = "torch.prims.convert_element_type"(%30066, %30069) : (!torch.vtensor<[128],f32>, !torch.int) -> !torch.vtensor<[128],f32>
    %30071 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
    %30072 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %30073 = "torch.prim.ListConstruct"(%30071, %30072) : (!torch.int, !torch.int) -> !torch.list<int>
    %30074 = "torch.aten.view"(%30068, %30073) : (!torch.vtensor<[131072],f32>, !torch.list<int>) -> !torch.vtensor<[131072,1],f32>
    %30075 = "torch.aten.mul.Tensor"(%30074, %30070) : (!torch.vtensor<[131072,1],f32>, !torch.vtensor<[128],f32>) -> !torch.vtensor<[131072,128],f32>
    %30076 = "torch.aten.cos"(%30075) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
    %30077 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %30078 = "torch.prims.convert_element_type"(%30076, %30077) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
    %30079 = "torch.aten.sin"(%30075) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
    %30080 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %30081 = "torch.prims.convert_element_type"(%30079, %30080) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
    %30082 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %30083 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %30084 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %30085 = "torch.aten.slice.Tensor"(%30078, %30082, %30083, %18481, %30084) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%30085, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %30086 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %30087 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %30088 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %30089 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %30090 = "torch.aten.slice.Tensor"(%30085, %30086, %30087, %30088, %30089) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%30090, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %30091 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %30092 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %30093 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %30094 = "torch.aten.slice.Tensor"(%30081, %30091, %30092, %18481, %30093) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%30094, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %30095 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %30096 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %30097 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %30098 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %30099 = "torch.aten.slice.Tensor"(%30094, %30095, %30096, %30097, %30098) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%30099, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %30100 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %30101 = "torch.aten.unsqueeze"(%30090, %30100) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%30101, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %30102 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %30103 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %30104 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %30105 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %30106 = "torch.aten.slice.Tensor"(%30101, %30102, %30103, %30104, %30105) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%30106, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %30107 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %30108 = "torch.aten.unsqueeze"(%30106, %30107) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%30108, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %30109 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %30110 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %30111 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %30112 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %30113 = "torch.aten.slice.Tensor"(%30108, %30109, %30110, %30111, %30112) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%30113, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %30114 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %30115 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %30116 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %30117 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %30118 = "torch.prim.ListConstruct"(%30114, %30115, %30116, %30117) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %30119 = "torch.aten.repeat"(%30113, %30118) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%30119, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
    %30120 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %30121 = "torch.aten.unsqueeze"(%30099, %30120) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%30121, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %30122 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %30123 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %30124 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %30125 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %30126 = "torch.aten.slice.Tensor"(%30121, %30122, %30123, %30124, %30125) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%30126, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %30127 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %30128 = "torch.aten.unsqueeze"(%30126, %30127) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%30128, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %30129 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %30130 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %30131 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %30132 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %30133 = "torch.aten.slice.Tensor"(%30128, %30129, %30130, %30131, %30132) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%30133, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %30134 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %30135 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %30136 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %30137 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %30138 = "torch.prim.ListConstruct"(%30134, %30135, %30136, %30137) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %30139 = "torch.aten.repeat"(%30133, %30138) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%30139, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
    %30140 = "torch.aten.mul.Tensor"(%29852, %30119) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%30140, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %30141 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %30142 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %30143 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %30144 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %30145 = "torch.aten.slice.Tensor"(%29852, %30141, %30142, %30143, %30144) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%30145, %18474) <{shape_expressions = #map19}> : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.int) -> ()
    %30146 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %30147 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %30148 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %30149 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %30150 = "torch.aten.slice.Tensor"(%29852, %30146, %30147, %30148, %30149) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%30150, %18474) <{shape_expressions = #map19}> : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.int) -> ()
    %30151 = "torch.aten.neg"(%30150) : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>) -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%30151, %18474) <{shape_expressions = #map19}> : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.int) -> ()
    %30152 = "torch.prim.ListConstruct"(%30151, %30145) : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>) -> !torch.list<vtensor>
    %30153 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %30154 = "torch.aten.cat"(%30152, %30153) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%30154, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %30155 = "torch.aten.mul.Tensor"(%30154, %30139) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%30155, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %30156 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %30157 = "torch.aten.add.Tensor"(%30140, %30155, %30156) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%30157, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %30158 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %30159 = "torch.aten.mul.Scalar"(%arg69, %30158) : (!torch.vtensor<[4,?],si64>, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%30159, %18474) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %30160 = "torch.constant.int"() <{value = 34 : i64}> : () -> !torch.int
    %30161 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %30162 = "torch.aten.add.Scalar"(%30159, %30160, %30161) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%30162, %18474) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %30163 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %30164 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %30165 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %30166 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %30167 = "torch.prim.ListConstruct"(%30163, %18477, %30164, %30165, %30166) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %30168 = "torch.aten.view"(%30157, %30167) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%30168, %18474) <{shape_expressions = #map22}> : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %30169 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %30170 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %30171 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %30172 = "torch.prim.ListConstruct"(%19011, %30169, %30170, %30171) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %30173 = "torch.aten.view"(%30168, %30172) : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%30173, %18474) <{shape_expressions = #map23}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %30174 = "torch.prim.ListConstruct"(%19011) : (!torch.int) -> !torch.list<int>
    %30175 = "torch.aten.view"(%30162, %30174) : (!torch.vtensor<[4,?],si64>, !torch.list<int>) -> !torch.vtensor<[?],si64>
    "torch.bind_symbolic_shape"(%30175, %18474) <{shape_expressions = #map24}> : (!torch.vtensor<[?],si64>, !torch.int) -> ()
    %30176 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %30177 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %30178 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %30179 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %30180 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %30181 = "torch.prim.ListConstruct"(%18479, %30176, %30177, %30178, %30179, %30180) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %30182 = "torch.aten.view"(%29584, %30181) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%30182, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %30183 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %30184 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %30185 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %30186 = "torch.prim.ListConstruct"(%18993, %30183, %30184, %30185) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %30187 = "torch.aten.view"(%30182, %30186) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%30187, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %30188 = "torch.prim.ListConstruct"(%30175) : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>>
    %30189 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %30190 = "torch.aten.index_put"(%30187, %30188, %30173, %30189) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.bool) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%30190, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %30191 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %30192 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %30193 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %30194 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %30195 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %30196 = "torch.prim.ListConstruct"(%18479, %30191, %30192, %30193, %30194, %30195) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %30197 = "torch.aten.view"(%30190, %30196) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%30197, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %30198 = "torch.constant.int"() <{value = 2097152 : i64}> : () -> !torch.int
    %30199 = "torch.prim.ListConstruct"(%18479, %30198) : (!torch.int, !torch.int) -> !torch.list<int>
    %30200 = "torch.aten.view"(%30197, %30199) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,2097152],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%30200, %18475) <{shape_expressions = #map2}> : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> ()
    %30201 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %30202 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %30203 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %30204 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %30205 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %30206 = "torch.prim.ListConstruct"(%18479, %30201, %30202, %30203, %30204, %30205) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %30207 = "torch.aten.view"(%30200, %30206) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%30207, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %30208 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %30209 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %30210 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %30211 = "torch.prim.ListConstruct"(%18993, %30208, %30209, %30210) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %30212 = "torch.aten.view"(%30207, %30211) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%30212, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %30213 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %30214 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %30215 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %30216 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %30217 = "torch.prim.ListConstruct"(%30213, %18477, %30214, %30215, %30216) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %30218 = "torch.aten.view"(%29857, %30217) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%30218, %18474) <{shape_expressions = #map22}> : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %30219 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %30220 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %30221 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %30222 = "torch.prim.ListConstruct"(%19011, %30219, %30220, %30221) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %30223 = "torch.aten.view"(%30218, %30222) : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%30223, %18474) <{shape_expressions = #map23}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %30224 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %30225 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %30226 = "torch.aten.add.Scalar"(%30162, %30224, %30225) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%30226, %18474) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %30227 = "torch.prim.ListConstruct"(%19011) : (!torch.int) -> !torch.list<int>
    %30228 = "torch.aten.view"(%30226, %30227) : (!torch.vtensor<[4,?],si64>, !torch.list<int>) -> !torch.vtensor<[?],si64>
    "torch.bind_symbolic_shape"(%30228, %18474) <{shape_expressions = #map24}> : (!torch.vtensor<[?],si64>, !torch.int) -> ()
    %30229 = "torch.prim.ListConstruct"(%30228) : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>>
    %30230 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %30231 = "torch.aten.index_put"(%30212, %30229, %30223, %30230) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.bool) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%30231, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %30232 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %30233 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %30234 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %30235 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %30236 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %30237 = "torch.prim.ListConstruct"(%18479, %30232, %30233, %30234, %30235, %30236) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %30238 = "torch.aten.view"(%30231, %30237) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%30238, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %30239 = "torch.constant.int"() <{value = 2097152 : i64}> : () -> !torch.int
    %30240 = "torch.prim.ListConstruct"(%18479, %30239) : (!torch.int, !torch.int) -> !torch.list<int>
    %30241 = "torch.aten.view"(%30238, %30240) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,2097152],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%30241, %18475) <{shape_expressions = #map2}> : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> ()
    %30242 = "torch.constant.int"() <{value = -2 : i64}> : () -> !torch.int
    %30243 = "torch.aten.unsqueeze"(%30157, %30242) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%30243, %18474) <{shape_expressions = #map25}> : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.int) -> ()
    %30244 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %30245 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %30246 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %30247 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %30248 = "torch.prim.ListConstruct"(%30244, %18481, %30245, %30246, %30247) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %30249 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %30250 = "torch.aten.expand"(%30243, %30248, %30249) : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%30250, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %30251 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %30252 = "torch.aten.clone"(%30250, %30251) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%30252, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %30253 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %30254 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %30255 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %30256 = "torch.prim.ListConstruct"(%30253, %18481, %30254, %30255) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %30257 = "torch.aten._unsafe_view"(%30252, %30256) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%30257, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %30258 = "torch.constant.int"() <{value = -2 : i64}> : () -> !torch.int
    %30259 = "torch.aten.unsqueeze"(%29857, %30258) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%30259, %18474) <{shape_expressions = #map25}> : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.int) -> ()
    %30260 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %30261 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %30262 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %30263 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %30264 = "torch.prim.ListConstruct"(%30260, %18481, %30261, %30262, %30263) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %30265 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %30266 = "torch.aten.expand"(%30259, %30264, %30265) : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%30266, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %30267 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %30268 = "torch.aten.clone"(%30266, %30267) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%30268, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %30269 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %30270 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %30271 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %30272 = "torch.prim.ListConstruct"(%30269, %18481, %30270, %30271) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %30273 = "torch.aten._unsafe_view"(%30268, %30272) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%30273, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %30274 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %30275 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %30276 = "torch.aten.transpose.int"(%30007, %30274, %30275) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%30276, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %30277 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %30278 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %30279 = "torch.aten.transpose.int"(%30257, %30277, %30278) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%30279, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %30280 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %30281 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %30282 = "torch.aten.transpose.int"(%30273, %30280, %30281) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%30282, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %30283 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %30284 = "torch.aten.squeeze.dim"(%18570, %30283) : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,1,?,?],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%30284, %18474) <{shape_expressions = #map8}> : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> ()
    %30285 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %30286 = "torch.aten.squeeze.dim"(%30284, %30285) : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,1,?,?],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%30286, %18474) <{shape_expressions = #map8}> : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> ()
    %30287 = "torch_c.to_builtin_tensor"(%30276) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %30288 = "torch_c.to_builtin_tensor"(%30279) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %30289 = "torch_c.to_builtin_tensor"(%30282) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %30290 = "torch_c.to_builtin_tensor"(%30286) : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>) -> tensor<4x1x?x?xf8E4M3FNUZ>
    %30291 = "tensor.cast"(%30290) : (tensor<4x1x?x?xf8E4M3FNUZ>) -> tensor<?x?x?x?xf8E4M3FNUZ>
    %30292 = "torch_c.to_builtin_tensor"(%17889) : (!torch.vtensor<[],f32>) -> tensor<f32>
    %30293 = "util.call"(%30287, %30288, %30289, %30292, %30291) <{callee = @sharktank_masked_flash_attention_4_32_128_128_f8E4M3FNUZ_f32_f32}> : (tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<f32>, tensor<?x?x?x?xf8E4M3FNUZ>) -> tensor<4x32x?x128xf32>
    %30294 = "torch_c.from_builtin_tensor"(%30293) : (tensor<4x32x?x128xf32>) -> !torch.vtensor<[4,32,?,128],f32>
    "torch.bind_symbolic_shape"(%30294, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f32>, !torch.int) -> ()
    %30295 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %30296 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %30297 = "torch.aten.transpose.int"(%30294, %30295, %30296) : (!torch.vtensor<[4,32,?,128],f32>, !torch.int, !torch.int) -> !torch.vtensor<[4,?,32,128],f32>
    "torch.bind_symbolic_shape"(%30297, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f32>, !torch.int) -> ()
    %30298 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %30299 = "torch.aten.clone"(%30297, %30298) : (!torch.vtensor<[4,?,32,128],f32>, !torch.int) -> !torch.vtensor<[4,?,32,128],f32>
    "torch.bind_symbolic_shape"(%30299, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f32>, !torch.int) -> ()
    %30300 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %30301 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %30302 = "torch.prim.ListConstruct"(%30300, %18481, %30301) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %30303 = "torch.aten._unsafe_view"(%30299, %30302) : (!torch.vtensor<[4,?,32,128],f32>, !torch.list<int>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%30303, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %30304 = "torch.aten.div.Tensor"(%30303, %17891) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%30304, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %30305 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %30306 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %30307 = "torch.aten.clamp"(%30304, %30305, %30306) : (!torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%30307, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %30308 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %30309 = "torch.prims.convert_element_type"(%30307, %30308) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%30309, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %30310 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %30311 = "torch.aten.unsqueeze"(%17893, %30310) : (!torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>
    %30312 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %30313 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %30314 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %30315 = "torch.prim.ListConstruct"(%30312, %30313, %30314) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %30316 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %30317 = "torch.aten.expand"(%30311, %30315, %30316) : (!torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ>
    %30318 = "torch_c.to_builtin_tensor"(%30309) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %30319 = "torch_c.to_builtin_tensor"(%30317) : (!torch.vtensor<[4,4096,4096],f8E4M3FNUZ>) -> tensor<4x4096x4096xf8E4M3FNUZ>
    %30320 = "util.call"(%30318, %30319) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x?x4096xf32>
    %30321 = "torch_c.from_builtin_tensor"(%30320) : (tensor<4x?x4096xf32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%30321, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %30322 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %30323 = "torch.prims.convert_element_type"(%30321, %30322) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%30323, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %30324 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %30325 = "torch.aten.add.Tensor"(%29751, %30323, %30324) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%30325, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %30326 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %30327 = "torch.prims.convert_element_type"(%30325, %30326) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%30327, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %30328 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %30329 = "torch.aten.pow.Tensor_Scalar"(%30327, %30328) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%30329, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %30330 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %30331 = "torch.prim.ListConstruct"(%30330) : (!torch.int) -> !torch.list<int>
    %30332 = "torch.constant.bool"() <{value = true}> : () -> !torch.bool
    %30333 = "torch.constant.none"() : () -> !torch.none
    %30334 = "torch.aten.mean.dim"(%30329, %30331, %30332, %30333) : (!torch.vtensor<[4,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%30334, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %30335 = "torch.constant.float"() <{value = 1.000000e-05 : f64}> : () -> !torch.float
    %30336 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %30337 = "torch.aten.add.Scalar"(%30334, %30335, %30336) : (!torch.vtensor<[4,?,1],f32>, !torch.float, !torch.int) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%30337, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %30338 = "torch.aten.rsqrt"(%30337) : (!torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%30338, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %30339 = "torch.aten.mul.Tensor"(%30327, %30338) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%30339, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %30340 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %30341 = "torch.prims.convert_element_type"(%30339, %30340) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%30341, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %30342 = "torch.aten.mul.Tensor"(%17895, %30341) : (!torch.vtensor<[4096],bf16>, !torch.vtensor<[4,?,4096],bf16>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%30342, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %30343 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %30344 = "torch.prims.convert_element_type"(%30342, %30343) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%30344, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %30345 = "torch.aten.div.Tensor"(%30344, %17897) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%30345, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %30346 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %30347 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %30348 = "torch.aten.clamp"(%30345, %30346, %30347) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%30348, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %30349 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %30350 = "torch.prims.convert_element_type"(%30348, %30349) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%30350, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %30351 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %30352 = "torch.aten.unsqueeze"(%17899, %30351) : (!torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>
    %30353 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %30354 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %30355 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %30356 = "torch.prim.ListConstruct"(%30353, %30354, %30355) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %30357 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %30358 = "torch.aten.expand"(%30352, %30356, %30357) : (!torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ>
    %30359 = "torch_c.to_builtin_tensor"(%30350) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %30360 = "torch_c.to_builtin_tensor"(%30358) : (!torch.vtensor<[4,14336,4096],f8E4M3FNUZ>) -> tensor<4x14336x4096xf8E4M3FNUZ>
    %30361 = "util.call"(%30359, %30360) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x?x14336xf32>
    %30362 = "torch_c.from_builtin_tensor"(%30361) : (tensor<4x?x14336xf32>) -> !torch.vtensor<[4,?,14336],f32>
    "torch.bind_symbolic_shape"(%30362, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> ()
    %30363 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %30364 = "torch.prims.convert_element_type"(%30362, %30363) : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%30364, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %30365 = "torch.aten.silu"(%30364) : (!torch.vtensor<[4,?,14336],bf16>) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%30365, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %30366 = "torch.aten.div.Tensor"(%30344, %17901) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%30366, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %30367 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %30368 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %30369 = "torch.aten.clamp"(%30366, %30367, %30368) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%30369, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %30370 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %30371 = "torch.prims.convert_element_type"(%30369, %30370) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%30371, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %30372 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %30373 = "torch.aten.unsqueeze"(%17903, %30372) : (!torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>
    %30374 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %30375 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %30376 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %30377 = "torch.prim.ListConstruct"(%30374, %30375, %30376) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %30378 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %30379 = "torch.aten.expand"(%30373, %30377, %30378) : (!torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ>
    %30380 = "torch_c.to_builtin_tensor"(%30371) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %30381 = "torch_c.to_builtin_tensor"(%30379) : (!torch.vtensor<[4,14336,4096],f8E4M3FNUZ>) -> tensor<4x14336x4096xf8E4M3FNUZ>
    %30382 = "util.call"(%30380, %30381) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x?x14336xf32>
    %30383 = "torch_c.from_builtin_tensor"(%30382) : (tensor<4x?x14336xf32>) -> !torch.vtensor<[4,?,14336],f32>
    "torch.bind_symbolic_shape"(%30383, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> ()
    %30384 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %30385 = "torch.prims.convert_element_type"(%30383, %30384) : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%30385, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %30386 = "torch.aten.mul.Tensor"(%30365, %30385) : (!torch.vtensor<[4,?,14336],bf16>, !torch.vtensor<[4,?,14336],bf16>) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%30386, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %30387 = "torch.aten.div.Tensor"(%30386, %17905) : (!torch.vtensor<[4,?,14336],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%30387, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %30388 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %30389 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %30390 = "torch.aten.clamp"(%30387, %30388, %30389) : (!torch.vtensor<[4,?,14336],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%30390, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %30391 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %30392 = "torch.prims.convert_element_type"(%30390, %30391) : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> !torch.vtensor<[4,?,14336],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%30392, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],f8E4M3FNUZ>, !torch.int) -> ()
    %30393 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %30394 = "torch.aten.unsqueeze"(%17907, %30393) : (!torch.vtensor<[4096,14336],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,14336],f8E4M3FNUZ>
    %30395 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %30396 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %30397 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %30398 = "torch.prim.ListConstruct"(%30395, %30396, %30397) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %30399 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %30400 = "torch.aten.expand"(%30394, %30398, %30399) : (!torch.vtensor<[1,4096,14336],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,14336],f8E4M3FNUZ>
    %30401 = "torch_c.to_builtin_tensor"(%30392) : (!torch.vtensor<[4,?,14336],f8E4M3FNUZ>) -> tensor<4x?x14336xf8E4M3FNUZ>
    %30402 = "torch_c.to_builtin_tensor"(%30400) : (!torch.vtensor<[4,4096,14336],f8E4M3FNUZ>) -> tensor<4x4096x14336xf8E4M3FNUZ>
    %30403 = "util.call"(%30401, %30402) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx14336xf8E4M3FNUZ_R4x4096x14336xf8E4M3FNUZ}> : (tensor<4x?x14336xf8E4M3FNUZ>, tensor<4x4096x14336xf8E4M3FNUZ>) -> tensor<4x?x4096xf32>
    %30404 = "torch_c.from_builtin_tensor"(%30403) : (tensor<4x?x4096xf32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%30404, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %30405 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %30406 = "torch.prims.convert_element_type"(%30404, %30405) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%30406, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %30407 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %30408 = "torch.aten.add.Tensor"(%30325, %30406, %30407) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%30408, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %30409 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %30410 = "torch.prims.convert_element_type"(%30408, %30409) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%30410, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %30411 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %30412 = "torch.aten.pow.Tensor_Scalar"(%30410, %30411) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%30412, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %30413 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %30414 = "torch.prim.ListConstruct"(%30413) : (!torch.int) -> !torch.list<int>
    %30415 = "torch.constant.bool"() <{value = true}> : () -> !torch.bool
    %30416 = "torch.constant.none"() : () -> !torch.none
    %30417 = "torch.aten.mean.dim"(%30412, %30414, %30415, %30416) : (!torch.vtensor<[4,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%30417, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %30418 = "torch.constant.float"() <{value = 1.000000e-05 : f64}> : () -> !torch.float
    %30419 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %30420 = "torch.aten.add.Scalar"(%30417, %30418, %30419) : (!torch.vtensor<[4,?,1],f32>, !torch.float, !torch.int) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%30420, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %30421 = "torch.aten.rsqrt"(%30420) : (!torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%30421, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %30422 = "torch.aten.mul.Tensor"(%30410, %30421) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%30422, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %30423 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %30424 = "torch.prims.convert_element_type"(%30422, %30423) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%30424, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %30425 = "torch.aten.mul.Tensor"(%17909, %30424) : (!torch.vtensor<[4096],bf16>, !torch.vtensor<[4,?,4096],bf16>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%30425, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %30426 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %30427 = "torch.prims.convert_element_type"(%30425, %30426) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%30427, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %30428 = "torch.aten.div.Tensor"(%30427, %17911) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%30428, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %30429 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %30430 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %30431 = "torch.aten.clamp"(%30428, %30429, %30430) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%30431, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %30432 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %30433 = "torch.prims.convert_element_type"(%30431, %30432) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%30433, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %30434 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %30435 = "torch.aten.unsqueeze"(%17913, %30434) : (!torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>
    %30436 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %30437 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %30438 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %30439 = "torch.prim.ListConstruct"(%30436, %30437, %30438) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %30440 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %30441 = "torch.aten.expand"(%30435, %30439, %30440) : (!torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ>
    %30442 = "torch_c.to_builtin_tensor"(%30433) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %30443 = "torch_c.to_builtin_tensor"(%30441) : (!torch.vtensor<[4,4096,4096],f8E4M3FNUZ>) -> tensor<4x4096x4096xf8E4M3FNUZ>
    %30444 = "util.call"(%30442, %30443) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x?x4096xf32>
    %30445 = "torch_c.from_builtin_tensor"(%30444) : (tensor<4x?x4096xf32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%30445, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %30446 = "torch.aten.div.Tensor"(%30445, %17915) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%30446, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %30447 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %30448 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %30449 = "torch.aten.clamp"(%30446, %30447, %30448) : (!torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%30449, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %30450 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %30451 = "torch.prims.convert_element_type"(%30449, %30450) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%30451, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %30452 = "torch.aten.div.Tensor"(%30427, %17917) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%30452, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %30453 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %30454 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %30455 = "torch.aten.clamp"(%30452, %30453, %30454) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%30455, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %30456 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %30457 = "torch.prims.convert_element_type"(%30455, %30456) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%30457, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %30458 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %30459 = "torch.aten.unsqueeze"(%17919, %30458) : (!torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>
    %30460 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %30461 = "torch.constant.int"() <{value = 1024 : i64}> : () -> !torch.int
    %30462 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %30463 = "torch.prim.ListConstruct"(%30460, %30461, %30462) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %30464 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %30465 = "torch.aten.expand"(%30459, %30463, %30464) : (!torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ>
    %30466 = "torch_c.to_builtin_tensor"(%30457) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %30467 = "torch_c.to_builtin_tensor"(%30465) : (!torch.vtensor<[4,1024,4096],f8E4M3FNUZ>) -> tensor<4x1024x4096xf8E4M3FNUZ>
    %30468 = "util.call"(%30466, %30467) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x?x1024xf32>
    %30469 = "torch_c.from_builtin_tensor"(%30468) : (tensor<4x?x1024xf32>) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%30469, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %30470 = "torch.aten.div.Tensor"(%30469, %17921) : (!torch.vtensor<[4,?,1024],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%30470, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %30471 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %30472 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %30473 = "torch.aten.clamp"(%30470, %30471, %30472) : (!torch.vtensor<[4,?,1024],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%30473, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %30474 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %30475 = "torch.prims.convert_element_type"(%30473, %30474) : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> !torch.vtensor<[4,?,1024],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%30475, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.int) -> ()
    %30476 = "torch.aten.div.Tensor"(%30427, %17923) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%30476, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %30477 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %30478 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %30479 = "torch.aten.clamp"(%30476, %30477, %30478) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%30479, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %30480 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %30481 = "torch.prims.convert_element_type"(%30479, %30480) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%30481, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %30482 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %30483 = "torch.aten.unsqueeze"(%17925, %30482) : (!torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>
    %30484 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %30485 = "torch.constant.int"() <{value = 1024 : i64}> : () -> !torch.int
    %30486 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %30487 = "torch.prim.ListConstruct"(%30484, %30485, %30486) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %30488 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %30489 = "torch.aten.expand"(%30483, %30487, %30488) : (!torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ>
    %30490 = "torch_c.to_builtin_tensor"(%30481) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %30491 = "torch_c.to_builtin_tensor"(%30489) : (!torch.vtensor<[4,1024,4096],f8E4M3FNUZ>) -> tensor<4x1024x4096xf8E4M3FNUZ>
    %30492 = "util.call"(%30490, %30491) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x?x1024xf32>
    %30493 = "torch_c.from_builtin_tensor"(%30492) : (tensor<4x?x1024xf32>) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%30493, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %30494 = "torch.aten.div.Tensor"(%30493, %17927) : (!torch.vtensor<[4,?,1024],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%30494, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %30495 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %30496 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %30497 = "torch.aten.clamp"(%30494, %30495, %30496) : (!torch.vtensor<[4,?,1024],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%30497, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %30498 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %30499 = "torch.prims.convert_element_type"(%30497, %30498) : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> !torch.vtensor<[4,?,1024],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%30499, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.int) -> ()
    %30500 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %30501 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %30502 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %30503 = "torch.prim.ListConstruct"(%30500, %18481, %30501, %30502) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %30504 = "torch.aten.view"(%30451, %30503) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%30504, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %30505 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %30506 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %30507 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %30508 = "torch.prim.ListConstruct"(%30505, %18481, %30506, %30507) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %30509 = "torch.aten.view"(%30475, %30508) : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%30509, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %30510 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %30511 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %30512 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %30513 = "torch.prim.ListConstruct"(%30510, %18481, %30511, %30512) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %30514 = "torch.aten.view"(%30499, %30513) : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%30514, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %30515 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
    %30516 = "torch.constant.none"() : () -> !torch.none
    %30517 = "torch.constant.none"() : () -> !torch.none
    %30518 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %30519 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %30520 = "torch.aten.arange"(%30515, %30516, %30517, %30518, %30519) : (!torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[131072],si64>
    %30521 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %30522 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %30523 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %30524 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %30525 = "torch.constant.none"() : () -> !torch.none
    %30526 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %30527 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %30528 = "torch.aten.arange.start_step"(%30521, %30522, %30523, %30524, %30525, %30526, %30527) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[64],si64>
    %30529 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %30530 = "torch.prims.convert_element_type"(%30528, %30529) : (!torch.vtensor<[64],si64>, !torch.int) -> !torch.vtensor<[64],f32>
    %30531 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %30532 = "torch.aten.div.Scalar"(%30530, %30531) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %30533 = "torch.constant.float"() <{value = 5.000000e+05 : f64}> : () -> !torch.float
    %30534 = "torch.aten.pow.Scalar"(%30533, %30532) : (!torch.float, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %30535 = "torch.aten.reciprocal"(%30534) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %30536 = "torch.constant.float"() <{value = 1.000000e+00 : f64}> : () -> !torch.float
    %30537 = "torch.aten.mul.Scalar"(%30535, %30536) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
    %30538 = "torch.aten.reciprocal"(%30537) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %30539 = "torch.constant.float"() <{value = 6.2831853071795862 : f64}> : () -> !torch.float
    %30540 = "torch.aten.mul.Scalar"(%30538, %30539) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
    %30541 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
    %30542 = "torch.aten.gt.Scalar"(%30540, %30541) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %30543 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %30544 = "torch.aten.div.Scalar"(%30537, %30543) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %30545 = "torch.aten.where.self"(%30542, %30544, %30537) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %30546 = "torch.aten.reciprocal"(%30540) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %30547 = "torch.constant.int"() <{value = 8192 : i64}> : () -> !torch.int
    %30548 = "torch.aten.mul.Scalar"(%30546, %30547) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %30549 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %30550 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %30551 = "torch.aten.sub.Scalar"(%30548, %30549, %30550) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
    %30552 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %30553 = "torch.aten.div.Scalar"(%30551, %30552) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %30554 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %30555 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %30556 = "torch.aten.rsub.Scalar"(%30553, %30554, %30555) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
    %30557 = "torch.aten.mul.Tensor"(%30556, %30545) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %30558 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %30559 = "torch.aten.div.Scalar"(%30557, %30558) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %30560 = "torch.aten.mul.Tensor"(%30553, %30545) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %30561 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %30562 = "torch.aten.add.Tensor"(%30559, %30560, %30561) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %30563 = "torch.constant.float"() <{value = 2.048000e+03 : f64}> : () -> !torch.float
    %30564 = "torch.aten.lt.Scalar"(%30540, %30563) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %30565 = "torch.aten.bitwise_not"(%30564) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %30566 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
    %30567 = "torch.aten.gt.Scalar"(%30540, %30566) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %30568 = "torch.aten.bitwise_not"(%30567) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %30569 = "torch.aten.mul.Tensor"(%30565, %30568) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %30570 = "torch.aten.where.self"(%30569, %30562, %30545) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %30571 = "torch.prim.ListConstruct"(%30570, %30570) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.list<vtensor>
    %30572 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %30573 = "torch.aten.cat"(%30571, %30572) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[128],f32>
    %30574 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %30575 = "torch.prims.convert_element_type"(%30520, %30574) : (!torch.vtensor<[131072],si64>, !torch.int) -> !torch.vtensor<[131072],f32>
    %30576 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %30577 = "torch.prims.convert_element_type"(%30573, %30576) : (!torch.vtensor<[128],f32>, !torch.int) -> !torch.vtensor<[128],f32>
    %30578 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
    %30579 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %30580 = "torch.prim.ListConstruct"(%30578, %30579) : (!torch.int, !torch.int) -> !torch.list<int>
    %30581 = "torch.aten.view"(%30575, %30580) : (!torch.vtensor<[131072],f32>, !torch.list<int>) -> !torch.vtensor<[131072,1],f32>
    %30582 = "torch.aten.mul.Tensor"(%30581, %30577) : (!torch.vtensor<[131072,1],f32>, !torch.vtensor<[128],f32>) -> !torch.vtensor<[131072,128],f32>
    %30583 = "torch.aten.cos"(%30582) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
    %30584 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %30585 = "torch.prims.convert_element_type"(%30583, %30584) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
    %30586 = "torch.aten.sin"(%30582) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
    %30587 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %30588 = "torch.prims.convert_element_type"(%30586, %30587) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
    %30589 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %30590 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %30591 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %30592 = "torch.aten.slice.Tensor"(%30585, %30589, %30590, %18481, %30591) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%30592, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %30593 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %30594 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %30595 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %30596 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %30597 = "torch.aten.slice.Tensor"(%30592, %30593, %30594, %30595, %30596) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%30597, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %30598 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %30599 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %30600 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %30601 = "torch.aten.slice.Tensor"(%30588, %30598, %30599, %18481, %30600) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%30601, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %30602 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %30603 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %30604 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %30605 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %30606 = "torch.aten.slice.Tensor"(%30601, %30602, %30603, %30604, %30605) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%30606, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %30607 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %30608 = "torch.aten.unsqueeze"(%30597, %30607) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%30608, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %30609 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %30610 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %30611 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %30612 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %30613 = "torch.aten.slice.Tensor"(%30608, %30609, %30610, %30611, %30612) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%30613, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %30614 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %30615 = "torch.aten.unsqueeze"(%30613, %30614) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%30615, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %30616 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %30617 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %30618 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %30619 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %30620 = "torch.aten.slice.Tensor"(%30615, %30616, %30617, %30618, %30619) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%30620, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %30621 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %30622 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %30623 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %30624 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %30625 = "torch.prim.ListConstruct"(%30621, %30622, %30623, %30624) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %30626 = "torch.aten.repeat"(%30620, %30625) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%30626, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
    %30627 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %30628 = "torch.aten.unsqueeze"(%30606, %30627) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%30628, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %30629 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %30630 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %30631 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %30632 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %30633 = "torch.aten.slice.Tensor"(%30628, %30629, %30630, %30631, %30632) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%30633, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %30634 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %30635 = "torch.aten.unsqueeze"(%30633, %30634) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%30635, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %30636 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %30637 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %30638 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %30639 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %30640 = "torch.aten.slice.Tensor"(%30635, %30636, %30637, %30638, %30639) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%30640, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %30641 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %30642 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %30643 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %30644 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %30645 = "torch.prim.ListConstruct"(%30641, %30642, %30643, %30644) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %30646 = "torch.aten.repeat"(%30640, %30645) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%30646, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
    %30647 = "torch.aten.mul.Tensor"(%30504, %30626) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%30647, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %30648 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %30649 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %30650 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %30651 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %30652 = "torch.aten.slice.Tensor"(%30504, %30648, %30649, %30650, %30651) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%30652, %18474) <{shape_expressions = #map18}> : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.int) -> ()
    %30653 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %30654 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %30655 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %30656 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %30657 = "torch.aten.slice.Tensor"(%30504, %30653, %30654, %30655, %30656) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%30657, %18474) <{shape_expressions = #map18}> : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.int) -> ()
    %30658 = "torch.aten.neg"(%30657) : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>) -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%30658, %18474) <{shape_expressions = #map18}> : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.int) -> ()
    %30659 = "torch.prim.ListConstruct"(%30658, %30652) : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>) -> !torch.list<vtensor>
    %30660 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %30661 = "torch.aten.cat"(%30659, %30660) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%30661, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %30662 = "torch.aten.mul.Tensor"(%30661, %30646) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%30662, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %30663 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %30664 = "torch.aten.add.Tensor"(%30647, %30662, %30663) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%30664, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %30665 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
    %30666 = "torch.constant.none"() : () -> !torch.none
    %30667 = "torch.constant.none"() : () -> !torch.none
    %30668 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %30669 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %30670 = "torch.aten.arange"(%30665, %30666, %30667, %30668, %30669) : (!torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[131072],si64>
    %30671 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %30672 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %30673 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %30674 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %30675 = "torch.constant.none"() : () -> !torch.none
    %30676 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %30677 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %30678 = "torch.aten.arange.start_step"(%30671, %30672, %30673, %30674, %30675, %30676, %30677) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[64],si64>
    %30679 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %30680 = "torch.prims.convert_element_type"(%30678, %30679) : (!torch.vtensor<[64],si64>, !torch.int) -> !torch.vtensor<[64],f32>
    %30681 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %30682 = "torch.aten.div.Scalar"(%30680, %30681) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %30683 = "torch.constant.float"() <{value = 5.000000e+05 : f64}> : () -> !torch.float
    %30684 = "torch.aten.pow.Scalar"(%30683, %30682) : (!torch.float, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %30685 = "torch.aten.reciprocal"(%30684) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %30686 = "torch.constant.float"() <{value = 1.000000e+00 : f64}> : () -> !torch.float
    %30687 = "torch.aten.mul.Scalar"(%30685, %30686) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
    %30688 = "torch.aten.reciprocal"(%30687) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %30689 = "torch.constant.float"() <{value = 6.2831853071795862 : f64}> : () -> !torch.float
    %30690 = "torch.aten.mul.Scalar"(%30688, %30689) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
    %30691 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
    %30692 = "torch.aten.gt.Scalar"(%30690, %30691) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %30693 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %30694 = "torch.aten.div.Scalar"(%30687, %30693) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %30695 = "torch.aten.where.self"(%30692, %30694, %30687) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %30696 = "torch.aten.reciprocal"(%30690) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %30697 = "torch.constant.int"() <{value = 8192 : i64}> : () -> !torch.int
    %30698 = "torch.aten.mul.Scalar"(%30696, %30697) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %30699 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %30700 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %30701 = "torch.aten.sub.Scalar"(%30698, %30699, %30700) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
    %30702 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %30703 = "torch.aten.div.Scalar"(%30701, %30702) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %30704 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %30705 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %30706 = "torch.aten.rsub.Scalar"(%30703, %30704, %30705) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
    %30707 = "torch.aten.mul.Tensor"(%30706, %30695) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %30708 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %30709 = "torch.aten.div.Scalar"(%30707, %30708) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %30710 = "torch.aten.mul.Tensor"(%30703, %30695) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %30711 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %30712 = "torch.aten.add.Tensor"(%30709, %30710, %30711) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %30713 = "torch.constant.float"() <{value = 2.048000e+03 : f64}> : () -> !torch.float
    %30714 = "torch.aten.lt.Scalar"(%30690, %30713) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %30715 = "torch.aten.bitwise_not"(%30714) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %30716 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
    %30717 = "torch.aten.gt.Scalar"(%30690, %30716) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %30718 = "torch.aten.bitwise_not"(%30717) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %30719 = "torch.aten.mul.Tensor"(%30715, %30718) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %30720 = "torch.aten.where.self"(%30719, %30712, %30695) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %30721 = "torch.prim.ListConstruct"(%30720, %30720) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.list<vtensor>
    %30722 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %30723 = "torch.aten.cat"(%30721, %30722) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[128],f32>
    %30724 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %30725 = "torch.prims.convert_element_type"(%30670, %30724) : (!torch.vtensor<[131072],si64>, !torch.int) -> !torch.vtensor<[131072],f32>
    %30726 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %30727 = "torch.prims.convert_element_type"(%30723, %30726) : (!torch.vtensor<[128],f32>, !torch.int) -> !torch.vtensor<[128],f32>
    %30728 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
    %30729 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %30730 = "torch.prim.ListConstruct"(%30728, %30729) : (!torch.int, !torch.int) -> !torch.list<int>
    %30731 = "torch.aten.view"(%30725, %30730) : (!torch.vtensor<[131072],f32>, !torch.list<int>) -> !torch.vtensor<[131072,1],f32>
    %30732 = "torch.aten.mul.Tensor"(%30731, %30727) : (!torch.vtensor<[131072,1],f32>, !torch.vtensor<[128],f32>) -> !torch.vtensor<[131072,128],f32>
    %30733 = "torch.aten.cos"(%30732) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
    %30734 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %30735 = "torch.prims.convert_element_type"(%30733, %30734) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
    %30736 = "torch.aten.sin"(%30732) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
    %30737 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %30738 = "torch.prims.convert_element_type"(%30736, %30737) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
    %30739 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %30740 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %30741 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %30742 = "torch.aten.slice.Tensor"(%30735, %30739, %30740, %18481, %30741) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%30742, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %30743 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %30744 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %30745 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %30746 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %30747 = "torch.aten.slice.Tensor"(%30742, %30743, %30744, %30745, %30746) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%30747, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %30748 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %30749 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %30750 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %30751 = "torch.aten.slice.Tensor"(%30738, %30748, %30749, %18481, %30750) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%30751, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %30752 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %30753 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %30754 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %30755 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %30756 = "torch.aten.slice.Tensor"(%30751, %30752, %30753, %30754, %30755) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%30756, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %30757 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %30758 = "torch.aten.unsqueeze"(%30747, %30757) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%30758, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %30759 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %30760 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %30761 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %30762 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %30763 = "torch.aten.slice.Tensor"(%30758, %30759, %30760, %30761, %30762) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%30763, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %30764 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %30765 = "torch.aten.unsqueeze"(%30763, %30764) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%30765, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %30766 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %30767 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %30768 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %30769 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %30770 = "torch.aten.slice.Tensor"(%30765, %30766, %30767, %30768, %30769) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%30770, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %30771 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %30772 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %30773 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %30774 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %30775 = "torch.prim.ListConstruct"(%30771, %30772, %30773, %30774) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %30776 = "torch.aten.repeat"(%30770, %30775) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%30776, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
    %30777 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %30778 = "torch.aten.unsqueeze"(%30756, %30777) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%30778, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %30779 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %30780 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %30781 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %30782 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %30783 = "torch.aten.slice.Tensor"(%30778, %30779, %30780, %30781, %30782) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%30783, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %30784 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %30785 = "torch.aten.unsqueeze"(%30783, %30784) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%30785, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %30786 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %30787 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %30788 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %30789 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %30790 = "torch.aten.slice.Tensor"(%30785, %30786, %30787, %30788, %30789) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%30790, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %30791 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %30792 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %30793 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %30794 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %30795 = "torch.prim.ListConstruct"(%30791, %30792, %30793, %30794) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %30796 = "torch.aten.repeat"(%30790, %30795) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%30796, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
    %30797 = "torch.aten.mul.Tensor"(%30509, %30776) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%30797, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %30798 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %30799 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %30800 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %30801 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %30802 = "torch.aten.slice.Tensor"(%30509, %30798, %30799, %30800, %30801) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%30802, %18474) <{shape_expressions = #map19}> : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.int) -> ()
    %30803 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %30804 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %30805 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %30806 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %30807 = "torch.aten.slice.Tensor"(%30509, %30803, %30804, %30805, %30806) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%30807, %18474) <{shape_expressions = #map19}> : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.int) -> ()
    %30808 = "torch.aten.neg"(%30807) : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>) -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%30808, %18474) <{shape_expressions = #map19}> : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.int) -> ()
    %30809 = "torch.prim.ListConstruct"(%30808, %30802) : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>) -> !torch.list<vtensor>
    %30810 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %30811 = "torch.aten.cat"(%30809, %30810) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%30811, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %30812 = "torch.aten.mul.Tensor"(%30811, %30796) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%30812, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %30813 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %30814 = "torch.aten.add.Tensor"(%30797, %30812, %30813) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%30814, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %30815 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %30816 = "torch.aten.mul.Scalar"(%arg69, %30815) : (!torch.vtensor<[4,?],si64>, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%30816, %18474) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %30817 = "torch.constant.int"() <{value = 36 : i64}> : () -> !torch.int
    %30818 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %30819 = "torch.aten.add.Scalar"(%30816, %30817, %30818) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%30819, %18474) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %30820 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %30821 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %30822 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %30823 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %30824 = "torch.prim.ListConstruct"(%30820, %18477, %30821, %30822, %30823) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %30825 = "torch.aten.view"(%30814, %30824) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%30825, %18474) <{shape_expressions = #map22}> : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %30826 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %30827 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %30828 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %30829 = "torch.prim.ListConstruct"(%19011, %30826, %30827, %30828) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %30830 = "torch.aten.view"(%30825, %30829) : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%30830, %18474) <{shape_expressions = #map23}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %30831 = "torch.prim.ListConstruct"(%19011) : (!torch.int) -> !torch.list<int>
    %30832 = "torch.aten.view"(%30819, %30831) : (!torch.vtensor<[4,?],si64>, !torch.list<int>) -> !torch.vtensor<[?],si64>
    "torch.bind_symbolic_shape"(%30832, %18474) <{shape_expressions = #map24}> : (!torch.vtensor<[?],si64>, !torch.int) -> ()
    %30833 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %30834 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %30835 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %30836 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %30837 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %30838 = "torch.prim.ListConstruct"(%18479, %30833, %30834, %30835, %30836, %30837) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %30839 = "torch.aten.view"(%30241, %30838) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%30839, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %30840 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %30841 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %30842 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %30843 = "torch.prim.ListConstruct"(%18993, %30840, %30841, %30842) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %30844 = "torch.aten.view"(%30839, %30843) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%30844, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %30845 = "torch.prim.ListConstruct"(%30832) : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>>
    %30846 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %30847 = "torch.aten.index_put"(%30844, %30845, %30830, %30846) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.bool) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%30847, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %30848 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %30849 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %30850 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %30851 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %30852 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %30853 = "torch.prim.ListConstruct"(%18479, %30848, %30849, %30850, %30851, %30852) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %30854 = "torch.aten.view"(%30847, %30853) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%30854, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %30855 = "torch.constant.int"() <{value = 2097152 : i64}> : () -> !torch.int
    %30856 = "torch.prim.ListConstruct"(%18479, %30855) : (!torch.int, !torch.int) -> !torch.list<int>
    %30857 = "torch.aten.view"(%30854, %30856) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,2097152],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%30857, %18475) <{shape_expressions = #map2}> : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> ()
    %30858 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %30859 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %30860 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %30861 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %30862 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %30863 = "torch.prim.ListConstruct"(%18479, %30858, %30859, %30860, %30861, %30862) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %30864 = "torch.aten.view"(%30857, %30863) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%30864, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %30865 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %30866 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %30867 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %30868 = "torch.prim.ListConstruct"(%18993, %30865, %30866, %30867) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %30869 = "torch.aten.view"(%30864, %30868) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%30869, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %30870 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %30871 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %30872 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %30873 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %30874 = "torch.prim.ListConstruct"(%30870, %18477, %30871, %30872, %30873) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %30875 = "torch.aten.view"(%30514, %30874) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%30875, %18474) <{shape_expressions = #map22}> : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %30876 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %30877 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %30878 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %30879 = "torch.prim.ListConstruct"(%19011, %30876, %30877, %30878) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %30880 = "torch.aten.view"(%30875, %30879) : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%30880, %18474) <{shape_expressions = #map23}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %30881 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %30882 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %30883 = "torch.aten.add.Scalar"(%30819, %30881, %30882) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%30883, %18474) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %30884 = "torch.prim.ListConstruct"(%19011) : (!torch.int) -> !torch.list<int>
    %30885 = "torch.aten.view"(%30883, %30884) : (!torch.vtensor<[4,?],si64>, !torch.list<int>) -> !torch.vtensor<[?],si64>
    "torch.bind_symbolic_shape"(%30885, %18474) <{shape_expressions = #map24}> : (!torch.vtensor<[?],si64>, !torch.int) -> ()
    %30886 = "torch.prim.ListConstruct"(%30885) : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>>
    %30887 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %30888 = "torch.aten.index_put"(%30869, %30886, %30880, %30887) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.bool) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%30888, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %30889 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %30890 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %30891 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %30892 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %30893 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %30894 = "torch.prim.ListConstruct"(%18479, %30889, %30890, %30891, %30892, %30893) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %30895 = "torch.aten.view"(%30888, %30894) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%30895, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %30896 = "torch.constant.int"() <{value = 2097152 : i64}> : () -> !torch.int
    %30897 = "torch.prim.ListConstruct"(%18479, %30896) : (!torch.int, !torch.int) -> !torch.list<int>
    %30898 = "torch.aten.view"(%30895, %30897) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,2097152],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%30898, %18475) <{shape_expressions = #map2}> : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> ()
    %30899 = "torch.constant.int"() <{value = -2 : i64}> : () -> !torch.int
    %30900 = "torch.aten.unsqueeze"(%30814, %30899) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%30900, %18474) <{shape_expressions = #map25}> : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.int) -> ()
    %30901 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %30902 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %30903 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %30904 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %30905 = "torch.prim.ListConstruct"(%30901, %18481, %30902, %30903, %30904) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %30906 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %30907 = "torch.aten.expand"(%30900, %30905, %30906) : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%30907, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %30908 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %30909 = "torch.aten.clone"(%30907, %30908) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%30909, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %30910 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %30911 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %30912 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %30913 = "torch.prim.ListConstruct"(%30910, %18481, %30911, %30912) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %30914 = "torch.aten._unsafe_view"(%30909, %30913) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%30914, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %30915 = "torch.constant.int"() <{value = -2 : i64}> : () -> !torch.int
    %30916 = "torch.aten.unsqueeze"(%30514, %30915) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%30916, %18474) <{shape_expressions = #map25}> : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.int) -> ()
    %30917 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %30918 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %30919 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %30920 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %30921 = "torch.prim.ListConstruct"(%30917, %18481, %30918, %30919, %30920) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %30922 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %30923 = "torch.aten.expand"(%30916, %30921, %30922) : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%30923, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %30924 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %30925 = "torch.aten.clone"(%30923, %30924) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%30925, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %30926 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %30927 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %30928 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %30929 = "torch.prim.ListConstruct"(%30926, %18481, %30927, %30928) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %30930 = "torch.aten._unsafe_view"(%30925, %30929) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%30930, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %30931 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %30932 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %30933 = "torch.aten.transpose.int"(%30664, %30931, %30932) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%30933, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %30934 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %30935 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %30936 = "torch.aten.transpose.int"(%30914, %30934, %30935) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%30936, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %30937 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %30938 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %30939 = "torch.aten.transpose.int"(%30930, %30937, %30938) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%30939, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %30940 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %30941 = "torch.aten.squeeze.dim"(%18570, %30940) : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,1,?,?],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%30941, %18474) <{shape_expressions = #map8}> : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> ()
    %30942 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %30943 = "torch.aten.squeeze.dim"(%30941, %30942) : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,1,?,?],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%30943, %18474) <{shape_expressions = #map8}> : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> ()
    %30944 = "torch_c.to_builtin_tensor"(%30933) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %30945 = "torch_c.to_builtin_tensor"(%30936) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %30946 = "torch_c.to_builtin_tensor"(%30939) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %30947 = "torch_c.to_builtin_tensor"(%30943) : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>) -> tensor<4x1x?x?xf8E4M3FNUZ>
    %30948 = "tensor.cast"(%30947) : (tensor<4x1x?x?xf8E4M3FNUZ>) -> tensor<?x?x?x?xf8E4M3FNUZ>
    %30949 = "torch_c.to_builtin_tensor"(%17929) : (!torch.vtensor<[],f32>) -> tensor<f32>
    %30950 = "util.call"(%30944, %30945, %30946, %30949, %30948) <{callee = @sharktank_masked_flash_attention_4_32_128_128_f8E4M3FNUZ_f32_f32}> : (tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<f32>, tensor<?x?x?x?xf8E4M3FNUZ>) -> tensor<4x32x?x128xf32>
    %30951 = "torch_c.from_builtin_tensor"(%30950) : (tensor<4x32x?x128xf32>) -> !torch.vtensor<[4,32,?,128],f32>
    "torch.bind_symbolic_shape"(%30951, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f32>, !torch.int) -> ()
    %30952 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %30953 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %30954 = "torch.aten.transpose.int"(%30951, %30952, %30953) : (!torch.vtensor<[4,32,?,128],f32>, !torch.int, !torch.int) -> !torch.vtensor<[4,?,32,128],f32>
    "torch.bind_symbolic_shape"(%30954, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f32>, !torch.int) -> ()
    %30955 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %30956 = "torch.aten.clone"(%30954, %30955) : (!torch.vtensor<[4,?,32,128],f32>, !torch.int) -> !torch.vtensor<[4,?,32,128],f32>
    "torch.bind_symbolic_shape"(%30956, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f32>, !torch.int) -> ()
    %30957 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %30958 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %30959 = "torch.prim.ListConstruct"(%30957, %18481, %30958) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %30960 = "torch.aten._unsafe_view"(%30956, %30959) : (!torch.vtensor<[4,?,32,128],f32>, !torch.list<int>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%30960, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %30961 = "torch.aten.div.Tensor"(%30960, %17931) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%30961, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %30962 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %30963 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %30964 = "torch.aten.clamp"(%30961, %30962, %30963) : (!torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%30964, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %30965 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %30966 = "torch.prims.convert_element_type"(%30964, %30965) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%30966, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %30967 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %30968 = "torch.aten.unsqueeze"(%17933, %30967) : (!torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>
    %30969 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %30970 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %30971 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %30972 = "torch.prim.ListConstruct"(%30969, %30970, %30971) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %30973 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %30974 = "torch.aten.expand"(%30968, %30972, %30973) : (!torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ>
    %30975 = "torch_c.to_builtin_tensor"(%30966) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %30976 = "torch_c.to_builtin_tensor"(%30974) : (!torch.vtensor<[4,4096,4096],f8E4M3FNUZ>) -> tensor<4x4096x4096xf8E4M3FNUZ>
    %30977 = "util.call"(%30975, %30976) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x?x4096xf32>
    %30978 = "torch_c.from_builtin_tensor"(%30977) : (tensor<4x?x4096xf32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%30978, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %30979 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %30980 = "torch.prims.convert_element_type"(%30978, %30979) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%30980, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %30981 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %30982 = "torch.aten.add.Tensor"(%30408, %30980, %30981) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%30982, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %30983 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %30984 = "torch.prims.convert_element_type"(%30982, %30983) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%30984, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %30985 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %30986 = "torch.aten.pow.Tensor_Scalar"(%30984, %30985) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%30986, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %30987 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %30988 = "torch.prim.ListConstruct"(%30987) : (!torch.int) -> !torch.list<int>
    %30989 = "torch.constant.bool"() <{value = true}> : () -> !torch.bool
    %30990 = "torch.constant.none"() : () -> !torch.none
    %30991 = "torch.aten.mean.dim"(%30986, %30988, %30989, %30990) : (!torch.vtensor<[4,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%30991, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %30992 = "torch.constant.float"() <{value = 1.000000e-05 : f64}> : () -> !torch.float
    %30993 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %30994 = "torch.aten.add.Scalar"(%30991, %30992, %30993) : (!torch.vtensor<[4,?,1],f32>, !torch.float, !torch.int) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%30994, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %30995 = "torch.aten.rsqrt"(%30994) : (!torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%30995, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %30996 = "torch.aten.mul.Tensor"(%30984, %30995) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%30996, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %30997 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %30998 = "torch.prims.convert_element_type"(%30996, %30997) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%30998, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %30999 = "torch.aten.mul.Tensor"(%17935, %30998) : (!torch.vtensor<[4096],bf16>, !torch.vtensor<[4,?,4096],bf16>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%30999, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %31000 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %31001 = "torch.prims.convert_element_type"(%30999, %31000) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%31001, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %31002 = "torch.aten.div.Tensor"(%31001, %17937) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%31002, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %31003 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %31004 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %31005 = "torch.aten.clamp"(%31002, %31003, %31004) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%31005, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %31006 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %31007 = "torch.prims.convert_element_type"(%31005, %31006) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%31007, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %31008 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %31009 = "torch.aten.unsqueeze"(%17939, %31008) : (!torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>
    %31010 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %31011 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %31012 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %31013 = "torch.prim.ListConstruct"(%31010, %31011, %31012) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %31014 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %31015 = "torch.aten.expand"(%31009, %31013, %31014) : (!torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ>
    %31016 = "torch_c.to_builtin_tensor"(%31007) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %31017 = "torch_c.to_builtin_tensor"(%31015) : (!torch.vtensor<[4,14336,4096],f8E4M3FNUZ>) -> tensor<4x14336x4096xf8E4M3FNUZ>
    %31018 = "util.call"(%31016, %31017) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x?x14336xf32>
    %31019 = "torch_c.from_builtin_tensor"(%31018) : (tensor<4x?x14336xf32>) -> !torch.vtensor<[4,?,14336],f32>
    "torch.bind_symbolic_shape"(%31019, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> ()
    %31020 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %31021 = "torch.prims.convert_element_type"(%31019, %31020) : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%31021, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %31022 = "torch.aten.silu"(%31021) : (!torch.vtensor<[4,?,14336],bf16>) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%31022, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %31023 = "torch.aten.div.Tensor"(%31001, %17941) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%31023, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %31024 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %31025 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %31026 = "torch.aten.clamp"(%31023, %31024, %31025) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%31026, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %31027 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %31028 = "torch.prims.convert_element_type"(%31026, %31027) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%31028, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %31029 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %31030 = "torch.aten.unsqueeze"(%17943, %31029) : (!torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>
    %31031 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %31032 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %31033 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %31034 = "torch.prim.ListConstruct"(%31031, %31032, %31033) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %31035 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %31036 = "torch.aten.expand"(%31030, %31034, %31035) : (!torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ>
    %31037 = "torch_c.to_builtin_tensor"(%31028) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %31038 = "torch_c.to_builtin_tensor"(%31036) : (!torch.vtensor<[4,14336,4096],f8E4M3FNUZ>) -> tensor<4x14336x4096xf8E4M3FNUZ>
    %31039 = "util.call"(%31037, %31038) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x?x14336xf32>
    %31040 = "torch_c.from_builtin_tensor"(%31039) : (tensor<4x?x14336xf32>) -> !torch.vtensor<[4,?,14336],f32>
    "torch.bind_symbolic_shape"(%31040, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> ()
    %31041 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %31042 = "torch.prims.convert_element_type"(%31040, %31041) : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%31042, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %31043 = "torch.aten.mul.Tensor"(%31022, %31042) : (!torch.vtensor<[4,?,14336],bf16>, !torch.vtensor<[4,?,14336],bf16>) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%31043, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %31044 = "torch.aten.div.Tensor"(%31043, %17945) : (!torch.vtensor<[4,?,14336],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%31044, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %31045 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %31046 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %31047 = "torch.aten.clamp"(%31044, %31045, %31046) : (!torch.vtensor<[4,?,14336],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%31047, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %31048 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %31049 = "torch.prims.convert_element_type"(%31047, %31048) : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> !torch.vtensor<[4,?,14336],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%31049, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],f8E4M3FNUZ>, !torch.int) -> ()
    %31050 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %31051 = "torch.aten.unsqueeze"(%17947, %31050) : (!torch.vtensor<[4096,14336],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,14336],f8E4M3FNUZ>
    %31052 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %31053 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %31054 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %31055 = "torch.prim.ListConstruct"(%31052, %31053, %31054) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %31056 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %31057 = "torch.aten.expand"(%31051, %31055, %31056) : (!torch.vtensor<[1,4096,14336],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,14336],f8E4M3FNUZ>
    %31058 = "torch_c.to_builtin_tensor"(%31049) : (!torch.vtensor<[4,?,14336],f8E4M3FNUZ>) -> tensor<4x?x14336xf8E4M3FNUZ>
    %31059 = "torch_c.to_builtin_tensor"(%31057) : (!torch.vtensor<[4,4096,14336],f8E4M3FNUZ>) -> tensor<4x4096x14336xf8E4M3FNUZ>
    %31060 = "util.call"(%31058, %31059) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx14336xf8E4M3FNUZ_R4x4096x14336xf8E4M3FNUZ}> : (tensor<4x?x14336xf8E4M3FNUZ>, tensor<4x4096x14336xf8E4M3FNUZ>) -> tensor<4x?x4096xf32>
    %31061 = "torch_c.from_builtin_tensor"(%31060) : (tensor<4x?x4096xf32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%31061, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %31062 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %31063 = "torch.prims.convert_element_type"(%31061, %31062) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%31063, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %31064 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %31065 = "torch.aten.add.Tensor"(%30982, %31063, %31064) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%31065, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %31066 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %31067 = "torch.prims.convert_element_type"(%31065, %31066) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%31067, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %31068 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %31069 = "torch.aten.pow.Tensor_Scalar"(%31067, %31068) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%31069, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %31070 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %31071 = "torch.prim.ListConstruct"(%31070) : (!torch.int) -> !torch.list<int>
    %31072 = "torch.constant.bool"() <{value = true}> : () -> !torch.bool
    %31073 = "torch.constant.none"() : () -> !torch.none
    %31074 = "torch.aten.mean.dim"(%31069, %31071, %31072, %31073) : (!torch.vtensor<[4,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%31074, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %31075 = "torch.constant.float"() <{value = 1.000000e-05 : f64}> : () -> !torch.float
    %31076 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %31077 = "torch.aten.add.Scalar"(%31074, %31075, %31076) : (!torch.vtensor<[4,?,1],f32>, !torch.float, !torch.int) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%31077, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %31078 = "torch.aten.rsqrt"(%31077) : (!torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%31078, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %31079 = "torch.aten.mul.Tensor"(%31067, %31078) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%31079, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %31080 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %31081 = "torch.prims.convert_element_type"(%31079, %31080) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%31081, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %31082 = "torch.aten.mul.Tensor"(%17949, %31081) : (!torch.vtensor<[4096],bf16>, !torch.vtensor<[4,?,4096],bf16>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%31082, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %31083 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %31084 = "torch.prims.convert_element_type"(%31082, %31083) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%31084, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %31085 = "torch.aten.div.Tensor"(%31084, %17951) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%31085, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %31086 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %31087 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %31088 = "torch.aten.clamp"(%31085, %31086, %31087) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%31088, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %31089 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %31090 = "torch.prims.convert_element_type"(%31088, %31089) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%31090, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %31091 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %31092 = "torch.aten.unsqueeze"(%17953, %31091) : (!torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>
    %31093 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %31094 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %31095 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %31096 = "torch.prim.ListConstruct"(%31093, %31094, %31095) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %31097 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %31098 = "torch.aten.expand"(%31092, %31096, %31097) : (!torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ>
    %31099 = "torch_c.to_builtin_tensor"(%31090) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %31100 = "torch_c.to_builtin_tensor"(%31098) : (!torch.vtensor<[4,4096,4096],f8E4M3FNUZ>) -> tensor<4x4096x4096xf8E4M3FNUZ>
    %31101 = "util.call"(%31099, %31100) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x?x4096xf32>
    %31102 = "torch_c.from_builtin_tensor"(%31101) : (tensor<4x?x4096xf32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%31102, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %31103 = "torch.aten.div.Tensor"(%31102, %17955) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%31103, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %31104 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %31105 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %31106 = "torch.aten.clamp"(%31103, %31104, %31105) : (!torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%31106, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %31107 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %31108 = "torch.prims.convert_element_type"(%31106, %31107) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%31108, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %31109 = "torch.aten.div.Tensor"(%31084, %17957) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%31109, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %31110 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %31111 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %31112 = "torch.aten.clamp"(%31109, %31110, %31111) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%31112, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %31113 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %31114 = "torch.prims.convert_element_type"(%31112, %31113) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%31114, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %31115 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %31116 = "torch.aten.unsqueeze"(%17959, %31115) : (!torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>
    %31117 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %31118 = "torch.constant.int"() <{value = 1024 : i64}> : () -> !torch.int
    %31119 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %31120 = "torch.prim.ListConstruct"(%31117, %31118, %31119) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %31121 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %31122 = "torch.aten.expand"(%31116, %31120, %31121) : (!torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ>
    %31123 = "torch_c.to_builtin_tensor"(%31114) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %31124 = "torch_c.to_builtin_tensor"(%31122) : (!torch.vtensor<[4,1024,4096],f8E4M3FNUZ>) -> tensor<4x1024x4096xf8E4M3FNUZ>
    %31125 = "util.call"(%31123, %31124) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x?x1024xf32>
    %31126 = "torch_c.from_builtin_tensor"(%31125) : (tensor<4x?x1024xf32>) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%31126, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %31127 = "torch.aten.div.Tensor"(%31126, %17961) : (!torch.vtensor<[4,?,1024],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%31127, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %31128 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %31129 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %31130 = "torch.aten.clamp"(%31127, %31128, %31129) : (!torch.vtensor<[4,?,1024],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%31130, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %31131 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %31132 = "torch.prims.convert_element_type"(%31130, %31131) : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> !torch.vtensor<[4,?,1024],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%31132, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.int) -> ()
    %31133 = "torch.aten.div.Tensor"(%31084, %17963) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%31133, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %31134 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %31135 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %31136 = "torch.aten.clamp"(%31133, %31134, %31135) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%31136, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %31137 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %31138 = "torch.prims.convert_element_type"(%31136, %31137) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%31138, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %31139 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %31140 = "torch.aten.unsqueeze"(%17965, %31139) : (!torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>
    %31141 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %31142 = "torch.constant.int"() <{value = 1024 : i64}> : () -> !torch.int
    %31143 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %31144 = "torch.prim.ListConstruct"(%31141, %31142, %31143) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %31145 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %31146 = "torch.aten.expand"(%31140, %31144, %31145) : (!torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ>
    %31147 = "torch_c.to_builtin_tensor"(%31138) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %31148 = "torch_c.to_builtin_tensor"(%31146) : (!torch.vtensor<[4,1024,4096],f8E4M3FNUZ>) -> tensor<4x1024x4096xf8E4M3FNUZ>
    %31149 = "util.call"(%31147, %31148) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x?x1024xf32>
    %31150 = "torch_c.from_builtin_tensor"(%31149) : (tensor<4x?x1024xf32>) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%31150, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %31151 = "torch.aten.div.Tensor"(%31150, %17967) : (!torch.vtensor<[4,?,1024],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%31151, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %31152 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %31153 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %31154 = "torch.aten.clamp"(%31151, %31152, %31153) : (!torch.vtensor<[4,?,1024],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%31154, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %31155 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %31156 = "torch.prims.convert_element_type"(%31154, %31155) : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> !torch.vtensor<[4,?,1024],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%31156, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.int) -> ()
    %31157 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %31158 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %31159 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %31160 = "torch.prim.ListConstruct"(%31157, %18481, %31158, %31159) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %31161 = "torch.aten.view"(%31108, %31160) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%31161, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %31162 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %31163 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %31164 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %31165 = "torch.prim.ListConstruct"(%31162, %18481, %31163, %31164) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %31166 = "torch.aten.view"(%31132, %31165) : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%31166, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %31167 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %31168 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %31169 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %31170 = "torch.prim.ListConstruct"(%31167, %18481, %31168, %31169) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %31171 = "torch.aten.view"(%31156, %31170) : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%31171, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %31172 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
    %31173 = "torch.constant.none"() : () -> !torch.none
    %31174 = "torch.constant.none"() : () -> !torch.none
    %31175 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %31176 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %31177 = "torch.aten.arange"(%31172, %31173, %31174, %31175, %31176) : (!torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[131072],si64>
    %31178 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %31179 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %31180 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %31181 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %31182 = "torch.constant.none"() : () -> !torch.none
    %31183 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %31184 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %31185 = "torch.aten.arange.start_step"(%31178, %31179, %31180, %31181, %31182, %31183, %31184) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[64],si64>
    %31186 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %31187 = "torch.prims.convert_element_type"(%31185, %31186) : (!torch.vtensor<[64],si64>, !torch.int) -> !torch.vtensor<[64],f32>
    %31188 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %31189 = "torch.aten.div.Scalar"(%31187, %31188) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %31190 = "torch.constant.float"() <{value = 5.000000e+05 : f64}> : () -> !torch.float
    %31191 = "torch.aten.pow.Scalar"(%31190, %31189) : (!torch.float, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %31192 = "torch.aten.reciprocal"(%31191) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %31193 = "torch.constant.float"() <{value = 1.000000e+00 : f64}> : () -> !torch.float
    %31194 = "torch.aten.mul.Scalar"(%31192, %31193) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
    %31195 = "torch.aten.reciprocal"(%31194) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %31196 = "torch.constant.float"() <{value = 6.2831853071795862 : f64}> : () -> !torch.float
    %31197 = "torch.aten.mul.Scalar"(%31195, %31196) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
    %31198 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
    %31199 = "torch.aten.gt.Scalar"(%31197, %31198) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %31200 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %31201 = "torch.aten.div.Scalar"(%31194, %31200) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %31202 = "torch.aten.where.self"(%31199, %31201, %31194) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %31203 = "torch.aten.reciprocal"(%31197) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %31204 = "torch.constant.int"() <{value = 8192 : i64}> : () -> !torch.int
    %31205 = "torch.aten.mul.Scalar"(%31203, %31204) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %31206 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %31207 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %31208 = "torch.aten.sub.Scalar"(%31205, %31206, %31207) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
    %31209 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %31210 = "torch.aten.div.Scalar"(%31208, %31209) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %31211 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %31212 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %31213 = "torch.aten.rsub.Scalar"(%31210, %31211, %31212) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
    %31214 = "torch.aten.mul.Tensor"(%31213, %31202) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %31215 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %31216 = "torch.aten.div.Scalar"(%31214, %31215) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %31217 = "torch.aten.mul.Tensor"(%31210, %31202) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %31218 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %31219 = "torch.aten.add.Tensor"(%31216, %31217, %31218) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %31220 = "torch.constant.float"() <{value = 2.048000e+03 : f64}> : () -> !torch.float
    %31221 = "torch.aten.lt.Scalar"(%31197, %31220) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %31222 = "torch.aten.bitwise_not"(%31221) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %31223 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
    %31224 = "torch.aten.gt.Scalar"(%31197, %31223) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %31225 = "torch.aten.bitwise_not"(%31224) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %31226 = "torch.aten.mul.Tensor"(%31222, %31225) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %31227 = "torch.aten.where.self"(%31226, %31219, %31202) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %31228 = "torch.prim.ListConstruct"(%31227, %31227) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.list<vtensor>
    %31229 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %31230 = "torch.aten.cat"(%31228, %31229) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[128],f32>
    %31231 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %31232 = "torch.prims.convert_element_type"(%31177, %31231) : (!torch.vtensor<[131072],si64>, !torch.int) -> !torch.vtensor<[131072],f32>
    %31233 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %31234 = "torch.prims.convert_element_type"(%31230, %31233) : (!torch.vtensor<[128],f32>, !torch.int) -> !torch.vtensor<[128],f32>
    %31235 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
    %31236 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %31237 = "torch.prim.ListConstruct"(%31235, %31236) : (!torch.int, !torch.int) -> !torch.list<int>
    %31238 = "torch.aten.view"(%31232, %31237) : (!torch.vtensor<[131072],f32>, !torch.list<int>) -> !torch.vtensor<[131072,1],f32>
    %31239 = "torch.aten.mul.Tensor"(%31238, %31234) : (!torch.vtensor<[131072,1],f32>, !torch.vtensor<[128],f32>) -> !torch.vtensor<[131072,128],f32>
    %31240 = "torch.aten.cos"(%31239) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
    %31241 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %31242 = "torch.prims.convert_element_type"(%31240, %31241) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
    %31243 = "torch.aten.sin"(%31239) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
    %31244 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %31245 = "torch.prims.convert_element_type"(%31243, %31244) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
    %31246 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %31247 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %31248 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %31249 = "torch.aten.slice.Tensor"(%31242, %31246, %31247, %18481, %31248) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%31249, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %31250 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %31251 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %31252 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %31253 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %31254 = "torch.aten.slice.Tensor"(%31249, %31250, %31251, %31252, %31253) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%31254, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %31255 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %31256 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %31257 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %31258 = "torch.aten.slice.Tensor"(%31245, %31255, %31256, %18481, %31257) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%31258, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %31259 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %31260 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %31261 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %31262 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %31263 = "torch.aten.slice.Tensor"(%31258, %31259, %31260, %31261, %31262) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%31263, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %31264 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %31265 = "torch.aten.unsqueeze"(%31254, %31264) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%31265, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %31266 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %31267 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %31268 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %31269 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %31270 = "torch.aten.slice.Tensor"(%31265, %31266, %31267, %31268, %31269) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%31270, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %31271 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %31272 = "torch.aten.unsqueeze"(%31270, %31271) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%31272, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %31273 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %31274 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %31275 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %31276 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %31277 = "torch.aten.slice.Tensor"(%31272, %31273, %31274, %31275, %31276) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%31277, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %31278 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %31279 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %31280 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %31281 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %31282 = "torch.prim.ListConstruct"(%31278, %31279, %31280, %31281) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %31283 = "torch.aten.repeat"(%31277, %31282) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%31283, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
    %31284 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %31285 = "torch.aten.unsqueeze"(%31263, %31284) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%31285, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %31286 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %31287 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %31288 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %31289 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %31290 = "torch.aten.slice.Tensor"(%31285, %31286, %31287, %31288, %31289) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%31290, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %31291 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %31292 = "torch.aten.unsqueeze"(%31290, %31291) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%31292, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %31293 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %31294 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %31295 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %31296 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %31297 = "torch.aten.slice.Tensor"(%31292, %31293, %31294, %31295, %31296) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%31297, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %31298 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %31299 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %31300 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %31301 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %31302 = "torch.prim.ListConstruct"(%31298, %31299, %31300, %31301) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %31303 = "torch.aten.repeat"(%31297, %31302) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%31303, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
    %31304 = "torch.aten.mul.Tensor"(%31161, %31283) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%31304, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %31305 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %31306 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %31307 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %31308 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %31309 = "torch.aten.slice.Tensor"(%31161, %31305, %31306, %31307, %31308) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%31309, %18474) <{shape_expressions = #map18}> : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.int) -> ()
    %31310 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %31311 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %31312 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %31313 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %31314 = "torch.aten.slice.Tensor"(%31161, %31310, %31311, %31312, %31313) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%31314, %18474) <{shape_expressions = #map18}> : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.int) -> ()
    %31315 = "torch.aten.neg"(%31314) : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>) -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%31315, %18474) <{shape_expressions = #map18}> : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.int) -> ()
    %31316 = "torch.prim.ListConstruct"(%31315, %31309) : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>) -> !torch.list<vtensor>
    %31317 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %31318 = "torch.aten.cat"(%31316, %31317) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%31318, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %31319 = "torch.aten.mul.Tensor"(%31318, %31303) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%31319, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %31320 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %31321 = "torch.aten.add.Tensor"(%31304, %31319, %31320) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%31321, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %31322 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
    %31323 = "torch.constant.none"() : () -> !torch.none
    %31324 = "torch.constant.none"() : () -> !torch.none
    %31325 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %31326 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %31327 = "torch.aten.arange"(%31322, %31323, %31324, %31325, %31326) : (!torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[131072],si64>
    %31328 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %31329 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %31330 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %31331 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %31332 = "torch.constant.none"() : () -> !torch.none
    %31333 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %31334 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %31335 = "torch.aten.arange.start_step"(%31328, %31329, %31330, %31331, %31332, %31333, %31334) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[64],si64>
    %31336 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %31337 = "torch.prims.convert_element_type"(%31335, %31336) : (!torch.vtensor<[64],si64>, !torch.int) -> !torch.vtensor<[64],f32>
    %31338 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %31339 = "torch.aten.div.Scalar"(%31337, %31338) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %31340 = "torch.constant.float"() <{value = 5.000000e+05 : f64}> : () -> !torch.float
    %31341 = "torch.aten.pow.Scalar"(%31340, %31339) : (!torch.float, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %31342 = "torch.aten.reciprocal"(%31341) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %31343 = "torch.constant.float"() <{value = 1.000000e+00 : f64}> : () -> !torch.float
    %31344 = "torch.aten.mul.Scalar"(%31342, %31343) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
    %31345 = "torch.aten.reciprocal"(%31344) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %31346 = "torch.constant.float"() <{value = 6.2831853071795862 : f64}> : () -> !torch.float
    %31347 = "torch.aten.mul.Scalar"(%31345, %31346) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
    %31348 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
    %31349 = "torch.aten.gt.Scalar"(%31347, %31348) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %31350 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %31351 = "torch.aten.div.Scalar"(%31344, %31350) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %31352 = "torch.aten.where.self"(%31349, %31351, %31344) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %31353 = "torch.aten.reciprocal"(%31347) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %31354 = "torch.constant.int"() <{value = 8192 : i64}> : () -> !torch.int
    %31355 = "torch.aten.mul.Scalar"(%31353, %31354) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %31356 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %31357 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %31358 = "torch.aten.sub.Scalar"(%31355, %31356, %31357) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
    %31359 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %31360 = "torch.aten.div.Scalar"(%31358, %31359) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %31361 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %31362 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %31363 = "torch.aten.rsub.Scalar"(%31360, %31361, %31362) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
    %31364 = "torch.aten.mul.Tensor"(%31363, %31352) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %31365 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %31366 = "torch.aten.div.Scalar"(%31364, %31365) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %31367 = "torch.aten.mul.Tensor"(%31360, %31352) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %31368 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %31369 = "torch.aten.add.Tensor"(%31366, %31367, %31368) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %31370 = "torch.constant.float"() <{value = 2.048000e+03 : f64}> : () -> !torch.float
    %31371 = "torch.aten.lt.Scalar"(%31347, %31370) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %31372 = "torch.aten.bitwise_not"(%31371) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %31373 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
    %31374 = "torch.aten.gt.Scalar"(%31347, %31373) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %31375 = "torch.aten.bitwise_not"(%31374) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %31376 = "torch.aten.mul.Tensor"(%31372, %31375) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %31377 = "torch.aten.where.self"(%31376, %31369, %31352) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %31378 = "torch.prim.ListConstruct"(%31377, %31377) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.list<vtensor>
    %31379 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %31380 = "torch.aten.cat"(%31378, %31379) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[128],f32>
    %31381 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %31382 = "torch.prims.convert_element_type"(%31327, %31381) : (!torch.vtensor<[131072],si64>, !torch.int) -> !torch.vtensor<[131072],f32>
    %31383 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %31384 = "torch.prims.convert_element_type"(%31380, %31383) : (!torch.vtensor<[128],f32>, !torch.int) -> !torch.vtensor<[128],f32>
    %31385 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
    %31386 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %31387 = "torch.prim.ListConstruct"(%31385, %31386) : (!torch.int, !torch.int) -> !torch.list<int>
    %31388 = "torch.aten.view"(%31382, %31387) : (!torch.vtensor<[131072],f32>, !torch.list<int>) -> !torch.vtensor<[131072,1],f32>
    %31389 = "torch.aten.mul.Tensor"(%31388, %31384) : (!torch.vtensor<[131072,1],f32>, !torch.vtensor<[128],f32>) -> !torch.vtensor<[131072,128],f32>
    %31390 = "torch.aten.cos"(%31389) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
    %31391 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %31392 = "torch.prims.convert_element_type"(%31390, %31391) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
    %31393 = "torch.aten.sin"(%31389) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
    %31394 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %31395 = "torch.prims.convert_element_type"(%31393, %31394) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
    %31396 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %31397 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %31398 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %31399 = "torch.aten.slice.Tensor"(%31392, %31396, %31397, %18481, %31398) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%31399, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %31400 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %31401 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %31402 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %31403 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %31404 = "torch.aten.slice.Tensor"(%31399, %31400, %31401, %31402, %31403) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%31404, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %31405 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %31406 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %31407 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %31408 = "torch.aten.slice.Tensor"(%31395, %31405, %31406, %18481, %31407) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%31408, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %31409 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %31410 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %31411 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %31412 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %31413 = "torch.aten.slice.Tensor"(%31408, %31409, %31410, %31411, %31412) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%31413, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %31414 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %31415 = "torch.aten.unsqueeze"(%31404, %31414) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%31415, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %31416 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %31417 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %31418 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %31419 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %31420 = "torch.aten.slice.Tensor"(%31415, %31416, %31417, %31418, %31419) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%31420, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %31421 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %31422 = "torch.aten.unsqueeze"(%31420, %31421) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%31422, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %31423 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %31424 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %31425 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %31426 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %31427 = "torch.aten.slice.Tensor"(%31422, %31423, %31424, %31425, %31426) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%31427, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %31428 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %31429 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %31430 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %31431 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %31432 = "torch.prim.ListConstruct"(%31428, %31429, %31430, %31431) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %31433 = "torch.aten.repeat"(%31427, %31432) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%31433, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
    %31434 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %31435 = "torch.aten.unsqueeze"(%31413, %31434) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%31435, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %31436 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %31437 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %31438 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %31439 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %31440 = "torch.aten.slice.Tensor"(%31435, %31436, %31437, %31438, %31439) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%31440, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %31441 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %31442 = "torch.aten.unsqueeze"(%31440, %31441) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%31442, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %31443 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %31444 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %31445 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %31446 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %31447 = "torch.aten.slice.Tensor"(%31442, %31443, %31444, %31445, %31446) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%31447, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %31448 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %31449 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %31450 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %31451 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %31452 = "torch.prim.ListConstruct"(%31448, %31449, %31450, %31451) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %31453 = "torch.aten.repeat"(%31447, %31452) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%31453, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
    %31454 = "torch.aten.mul.Tensor"(%31166, %31433) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%31454, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %31455 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %31456 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %31457 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %31458 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %31459 = "torch.aten.slice.Tensor"(%31166, %31455, %31456, %31457, %31458) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%31459, %18474) <{shape_expressions = #map19}> : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.int) -> ()
    %31460 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %31461 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %31462 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %31463 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %31464 = "torch.aten.slice.Tensor"(%31166, %31460, %31461, %31462, %31463) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%31464, %18474) <{shape_expressions = #map19}> : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.int) -> ()
    %31465 = "torch.aten.neg"(%31464) : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>) -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%31465, %18474) <{shape_expressions = #map19}> : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.int) -> ()
    %31466 = "torch.prim.ListConstruct"(%31465, %31459) : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>) -> !torch.list<vtensor>
    %31467 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %31468 = "torch.aten.cat"(%31466, %31467) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%31468, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %31469 = "torch.aten.mul.Tensor"(%31468, %31453) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%31469, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %31470 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %31471 = "torch.aten.add.Tensor"(%31454, %31469, %31470) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%31471, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %31472 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %31473 = "torch.aten.mul.Scalar"(%arg69, %31472) : (!torch.vtensor<[4,?],si64>, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%31473, %18474) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %31474 = "torch.constant.int"() <{value = 38 : i64}> : () -> !torch.int
    %31475 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %31476 = "torch.aten.add.Scalar"(%31473, %31474, %31475) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%31476, %18474) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %31477 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %31478 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %31479 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %31480 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %31481 = "torch.prim.ListConstruct"(%31477, %18477, %31478, %31479, %31480) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %31482 = "torch.aten.view"(%31471, %31481) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%31482, %18474) <{shape_expressions = #map22}> : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %31483 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %31484 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %31485 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %31486 = "torch.prim.ListConstruct"(%19011, %31483, %31484, %31485) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %31487 = "torch.aten.view"(%31482, %31486) : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%31487, %18474) <{shape_expressions = #map23}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %31488 = "torch.prim.ListConstruct"(%19011) : (!torch.int) -> !torch.list<int>
    %31489 = "torch.aten.view"(%31476, %31488) : (!torch.vtensor<[4,?],si64>, !torch.list<int>) -> !torch.vtensor<[?],si64>
    "torch.bind_symbolic_shape"(%31489, %18474) <{shape_expressions = #map24}> : (!torch.vtensor<[?],si64>, !torch.int) -> ()
    %31490 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %31491 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %31492 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %31493 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %31494 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %31495 = "torch.prim.ListConstruct"(%18479, %31490, %31491, %31492, %31493, %31494) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %31496 = "torch.aten.view"(%30898, %31495) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%31496, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %31497 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %31498 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %31499 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %31500 = "torch.prim.ListConstruct"(%18993, %31497, %31498, %31499) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %31501 = "torch.aten.view"(%31496, %31500) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%31501, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %31502 = "torch.prim.ListConstruct"(%31489) : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>>
    %31503 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %31504 = "torch.aten.index_put"(%31501, %31502, %31487, %31503) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.bool) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%31504, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %31505 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %31506 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %31507 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %31508 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %31509 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %31510 = "torch.prim.ListConstruct"(%18479, %31505, %31506, %31507, %31508, %31509) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %31511 = "torch.aten.view"(%31504, %31510) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%31511, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %31512 = "torch.constant.int"() <{value = 2097152 : i64}> : () -> !torch.int
    %31513 = "torch.prim.ListConstruct"(%18479, %31512) : (!torch.int, !torch.int) -> !torch.list<int>
    %31514 = "torch.aten.view"(%31511, %31513) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,2097152],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%31514, %18475) <{shape_expressions = #map2}> : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> ()
    %31515 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %31516 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %31517 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %31518 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %31519 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %31520 = "torch.prim.ListConstruct"(%18479, %31515, %31516, %31517, %31518, %31519) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %31521 = "torch.aten.view"(%31514, %31520) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%31521, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %31522 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %31523 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %31524 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %31525 = "torch.prim.ListConstruct"(%18993, %31522, %31523, %31524) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %31526 = "torch.aten.view"(%31521, %31525) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%31526, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %31527 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %31528 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %31529 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %31530 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %31531 = "torch.prim.ListConstruct"(%31527, %18477, %31528, %31529, %31530) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %31532 = "torch.aten.view"(%31171, %31531) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%31532, %18474) <{shape_expressions = #map22}> : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %31533 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %31534 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %31535 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %31536 = "torch.prim.ListConstruct"(%19011, %31533, %31534, %31535) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %31537 = "torch.aten.view"(%31532, %31536) : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%31537, %18474) <{shape_expressions = #map23}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %31538 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %31539 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %31540 = "torch.aten.add.Scalar"(%31476, %31538, %31539) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%31540, %18474) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %31541 = "torch.prim.ListConstruct"(%19011) : (!torch.int) -> !torch.list<int>
    %31542 = "torch.aten.view"(%31540, %31541) : (!torch.vtensor<[4,?],si64>, !torch.list<int>) -> !torch.vtensor<[?],si64>
    "torch.bind_symbolic_shape"(%31542, %18474) <{shape_expressions = #map24}> : (!torch.vtensor<[?],si64>, !torch.int) -> ()
    %31543 = "torch.prim.ListConstruct"(%31542) : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>>
    %31544 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %31545 = "torch.aten.index_put"(%31526, %31543, %31537, %31544) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.bool) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%31545, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %31546 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %31547 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %31548 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %31549 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %31550 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %31551 = "torch.prim.ListConstruct"(%18479, %31546, %31547, %31548, %31549, %31550) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %31552 = "torch.aten.view"(%31545, %31551) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%31552, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %31553 = "torch.constant.int"() <{value = 2097152 : i64}> : () -> !torch.int
    %31554 = "torch.prim.ListConstruct"(%18479, %31553) : (!torch.int, !torch.int) -> !torch.list<int>
    %31555 = "torch.aten.view"(%31552, %31554) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,2097152],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%31555, %18475) <{shape_expressions = #map2}> : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> ()
    %31556 = "torch.constant.int"() <{value = -2 : i64}> : () -> !torch.int
    %31557 = "torch.aten.unsqueeze"(%31471, %31556) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%31557, %18474) <{shape_expressions = #map25}> : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.int) -> ()
    %31558 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %31559 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %31560 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %31561 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %31562 = "torch.prim.ListConstruct"(%31558, %18481, %31559, %31560, %31561) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %31563 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %31564 = "torch.aten.expand"(%31557, %31562, %31563) : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%31564, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %31565 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %31566 = "torch.aten.clone"(%31564, %31565) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%31566, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %31567 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %31568 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %31569 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %31570 = "torch.prim.ListConstruct"(%31567, %18481, %31568, %31569) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %31571 = "torch.aten._unsafe_view"(%31566, %31570) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%31571, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %31572 = "torch.constant.int"() <{value = -2 : i64}> : () -> !torch.int
    %31573 = "torch.aten.unsqueeze"(%31171, %31572) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%31573, %18474) <{shape_expressions = #map25}> : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.int) -> ()
    %31574 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %31575 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %31576 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %31577 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %31578 = "torch.prim.ListConstruct"(%31574, %18481, %31575, %31576, %31577) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %31579 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %31580 = "torch.aten.expand"(%31573, %31578, %31579) : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%31580, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %31581 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %31582 = "torch.aten.clone"(%31580, %31581) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%31582, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %31583 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %31584 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %31585 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %31586 = "torch.prim.ListConstruct"(%31583, %18481, %31584, %31585) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %31587 = "torch.aten._unsafe_view"(%31582, %31586) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%31587, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %31588 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %31589 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %31590 = "torch.aten.transpose.int"(%31321, %31588, %31589) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%31590, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %31591 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %31592 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %31593 = "torch.aten.transpose.int"(%31571, %31591, %31592) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%31593, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %31594 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %31595 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %31596 = "torch.aten.transpose.int"(%31587, %31594, %31595) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%31596, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %31597 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %31598 = "torch.aten.squeeze.dim"(%18570, %31597) : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,1,?,?],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%31598, %18474) <{shape_expressions = #map8}> : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> ()
    %31599 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %31600 = "torch.aten.squeeze.dim"(%31598, %31599) : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,1,?,?],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%31600, %18474) <{shape_expressions = #map8}> : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> ()
    %31601 = "torch_c.to_builtin_tensor"(%31590) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %31602 = "torch_c.to_builtin_tensor"(%31593) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %31603 = "torch_c.to_builtin_tensor"(%31596) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %31604 = "torch_c.to_builtin_tensor"(%31600) : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>) -> tensor<4x1x?x?xf8E4M3FNUZ>
    %31605 = "tensor.cast"(%31604) : (tensor<4x1x?x?xf8E4M3FNUZ>) -> tensor<?x?x?x?xf8E4M3FNUZ>
    %31606 = "torch_c.to_builtin_tensor"(%17969) : (!torch.vtensor<[],f32>) -> tensor<f32>
    %31607 = "util.call"(%31601, %31602, %31603, %31606, %31605) <{callee = @sharktank_masked_flash_attention_4_32_128_128_f8E4M3FNUZ_f32_f32}> : (tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<f32>, tensor<?x?x?x?xf8E4M3FNUZ>) -> tensor<4x32x?x128xf32>
    %31608 = "torch_c.from_builtin_tensor"(%31607) : (tensor<4x32x?x128xf32>) -> !torch.vtensor<[4,32,?,128],f32>
    "torch.bind_symbolic_shape"(%31608, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f32>, !torch.int) -> ()
    %31609 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %31610 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %31611 = "torch.aten.transpose.int"(%31608, %31609, %31610) : (!torch.vtensor<[4,32,?,128],f32>, !torch.int, !torch.int) -> !torch.vtensor<[4,?,32,128],f32>
    "torch.bind_symbolic_shape"(%31611, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f32>, !torch.int) -> ()
    %31612 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %31613 = "torch.aten.clone"(%31611, %31612) : (!torch.vtensor<[4,?,32,128],f32>, !torch.int) -> !torch.vtensor<[4,?,32,128],f32>
    "torch.bind_symbolic_shape"(%31613, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f32>, !torch.int) -> ()
    %31614 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %31615 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %31616 = "torch.prim.ListConstruct"(%31614, %18481, %31615) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %31617 = "torch.aten._unsafe_view"(%31613, %31616) : (!torch.vtensor<[4,?,32,128],f32>, !torch.list<int>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%31617, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %31618 = "torch.aten.div.Tensor"(%31617, %17971) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%31618, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %31619 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %31620 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %31621 = "torch.aten.clamp"(%31618, %31619, %31620) : (!torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%31621, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %31622 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %31623 = "torch.prims.convert_element_type"(%31621, %31622) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%31623, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %31624 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %31625 = "torch.aten.unsqueeze"(%17973, %31624) : (!torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>
    %31626 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %31627 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %31628 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %31629 = "torch.prim.ListConstruct"(%31626, %31627, %31628) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %31630 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %31631 = "torch.aten.expand"(%31625, %31629, %31630) : (!torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ>
    %31632 = "torch_c.to_builtin_tensor"(%31623) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %31633 = "torch_c.to_builtin_tensor"(%31631) : (!torch.vtensor<[4,4096,4096],f8E4M3FNUZ>) -> tensor<4x4096x4096xf8E4M3FNUZ>
    %31634 = "util.call"(%31632, %31633) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x?x4096xf32>
    %31635 = "torch_c.from_builtin_tensor"(%31634) : (tensor<4x?x4096xf32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%31635, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %31636 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %31637 = "torch.prims.convert_element_type"(%31635, %31636) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%31637, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %31638 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %31639 = "torch.aten.add.Tensor"(%31065, %31637, %31638) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%31639, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %31640 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %31641 = "torch.prims.convert_element_type"(%31639, %31640) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%31641, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %31642 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %31643 = "torch.aten.pow.Tensor_Scalar"(%31641, %31642) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%31643, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %31644 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %31645 = "torch.prim.ListConstruct"(%31644) : (!torch.int) -> !torch.list<int>
    %31646 = "torch.constant.bool"() <{value = true}> : () -> !torch.bool
    %31647 = "torch.constant.none"() : () -> !torch.none
    %31648 = "torch.aten.mean.dim"(%31643, %31645, %31646, %31647) : (!torch.vtensor<[4,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%31648, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %31649 = "torch.constant.float"() <{value = 1.000000e-05 : f64}> : () -> !torch.float
    %31650 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %31651 = "torch.aten.add.Scalar"(%31648, %31649, %31650) : (!torch.vtensor<[4,?,1],f32>, !torch.float, !torch.int) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%31651, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %31652 = "torch.aten.rsqrt"(%31651) : (!torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%31652, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %31653 = "torch.aten.mul.Tensor"(%31641, %31652) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%31653, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %31654 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %31655 = "torch.prims.convert_element_type"(%31653, %31654) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%31655, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %31656 = "torch.aten.mul.Tensor"(%17975, %31655) : (!torch.vtensor<[4096],bf16>, !torch.vtensor<[4,?,4096],bf16>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%31656, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %31657 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %31658 = "torch.prims.convert_element_type"(%31656, %31657) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%31658, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %31659 = "torch.aten.div.Tensor"(%31658, %17977) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%31659, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %31660 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %31661 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %31662 = "torch.aten.clamp"(%31659, %31660, %31661) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%31662, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %31663 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %31664 = "torch.prims.convert_element_type"(%31662, %31663) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%31664, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %31665 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %31666 = "torch.aten.unsqueeze"(%17979, %31665) : (!torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>
    %31667 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %31668 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %31669 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %31670 = "torch.prim.ListConstruct"(%31667, %31668, %31669) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %31671 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %31672 = "torch.aten.expand"(%31666, %31670, %31671) : (!torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ>
    %31673 = "torch_c.to_builtin_tensor"(%31664) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %31674 = "torch_c.to_builtin_tensor"(%31672) : (!torch.vtensor<[4,14336,4096],f8E4M3FNUZ>) -> tensor<4x14336x4096xf8E4M3FNUZ>
    %31675 = "util.call"(%31673, %31674) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x?x14336xf32>
    %31676 = "torch_c.from_builtin_tensor"(%31675) : (tensor<4x?x14336xf32>) -> !torch.vtensor<[4,?,14336],f32>
    "torch.bind_symbolic_shape"(%31676, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> ()
    %31677 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %31678 = "torch.prims.convert_element_type"(%31676, %31677) : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%31678, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %31679 = "torch.aten.silu"(%31678) : (!torch.vtensor<[4,?,14336],bf16>) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%31679, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %31680 = "torch.aten.div.Tensor"(%31658, %17981) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%31680, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %31681 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %31682 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %31683 = "torch.aten.clamp"(%31680, %31681, %31682) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%31683, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %31684 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %31685 = "torch.prims.convert_element_type"(%31683, %31684) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%31685, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %31686 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %31687 = "torch.aten.unsqueeze"(%17983, %31686) : (!torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>
    %31688 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %31689 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %31690 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %31691 = "torch.prim.ListConstruct"(%31688, %31689, %31690) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %31692 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %31693 = "torch.aten.expand"(%31687, %31691, %31692) : (!torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ>
    %31694 = "torch_c.to_builtin_tensor"(%31685) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %31695 = "torch_c.to_builtin_tensor"(%31693) : (!torch.vtensor<[4,14336,4096],f8E4M3FNUZ>) -> tensor<4x14336x4096xf8E4M3FNUZ>
    %31696 = "util.call"(%31694, %31695) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x?x14336xf32>
    %31697 = "torch_c.from_builtin_tensor"(%31696) : (tensor<4x?x14336xf32>) -> !torch.vtensor<[4,?,14336],f32>
    "torch.bind_symbolic_shape"(%31697, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> ()
    %31698 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %31699 = "torch.prims.convert_element_type"(%31697, %31698) : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%31699, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %31700 = "torch.aten.mul.Tensor"(%31679, %31699) : (!torch.vtensor<[4,?,14336],bf16>, !torch.vtensor<[4,?,14336],bf16>) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%31700, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %31701 = "torch.aten.div.Tensor"(%31700, %17985) : (!torch.vtensor<[4,?,14336],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%31701, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %31702 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %31703 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %31704 = "torch.aten.clamp"(%31701, %31702, %31703) : (!torch.vtensor<[4,?,14336],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%31704, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %31705 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %31706 = "torch.prims.convert_element_type"(%31704, %31705) : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> !torch.vtensor<[4,?,14336],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%31706, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],f8E4M3FNUZ>, !torch.int) -> ()
    %31707 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %31708 = "torch.aten.unsqueeze"(%17987, %31707) : (!torch.vtensor<[4096,14336],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,14336],f8E4M3FNUZ>
    %31709 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %31710 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %31711 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %31712 = "torch.prim.ListConstruct"(%31709, %31710, %31711) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %31713 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %31714 = "torch.aten.expand"(%31708, %31712, %31713) : (!torch.vtensor<[1,4096,14336],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,14336],f8E4M3FNUZ>
    %31715 = "torch_c.to_builtin_tensor"(%31706) : (!torch.vtensor<[4,?,14336],f8E4M3FNUZ>) -> tensor<4x?x14336xf8E4M3FNUZ>
    %31716 = "torch_c.to_builtin_tensor"(%31714) : (!torch.vtensor<[4,4096,14336],f8E4M3FNUZ>) -> tensor<4x4096x14336xf8E4M3FNUZ>
    %31717 = "util.call"(%31715, %31716) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx14336xf8E4M3FNUZ_R4x4096x14336xf8E4M3FNUZ}> : (tensor<4x?x14336xf8E4M3FNUZ>, tensor<4x4096x14336xf8E4M3FNUZ>) -> tensor<4x?x4096xf32>
    %31718 = "torch_c.from_builtin_tensor"(%31717) : (tensor<4x?x4096xf32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%31718, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %31719 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %31720 = "torch.prims.convert_element_type"(%31718, %31719) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%31720, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %31721 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %31722 = "torch.aten.add.Tensor"(%31639, %31720, %31721) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%31722, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %31723 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %31724 = "torch.prims.convert_element_type"(%31722, %31723) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%31724, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %31725 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %31726 = "torch.aten.pow.Tensor_Scalar"(%31724, %31725) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%31726, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %31727 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %31728 = "torch.prim.ListConstruct"(%31727) : (!torch.int) -> !torch.list<int>
    %31729 = "torch.constant.bool"() <{value = true}> : () -> !torch.bool
    %31730 = "torch.constant.none"() : () -> !torch.none
    %31731 = "torch.aten.mean.dim"(%31726, %31728, %31729, %31730) : (!torch.vtensor<[4,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%31731, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %31732 = "torch.constant.float"() <{value = 1.000000e-05 : f64}> : () -> !torch.float
    %31733 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %31734 = "torch.aten.add.Scalar"(%31731, %31732, %31733) : (!torch.vtensor<[4,?,1],f32>, !torch.float, !torch.int) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%31734, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %31735 = "torch.aten.rsqrt"(%31734) : (!torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%31735, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %31736 = "torch.aten.mul.Tensor"(%31724, %31735) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%31736, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %31737 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %31738 = "torch.prims.convert_element_type"(%31736, %31737) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%31738, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %31739 = "torch.aten.mul.Tensor"(%17989, %31738) : (!torch.vtensor<[4096],bf16>, !torch.vtensor<[4,?,4096],bf16>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%31739, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %31740 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %31741 = "torch.prims.convert_element_type"(%31739, %31740) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%31741, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %31742 = "torch.aten.div.Tensor"(%31741, %17991) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%31742, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %31743 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %31744 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %31745 = "torch.aten.clamp"(%31742, %31743, %31744) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%31745, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %31746 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %31747 = "torch.prims.convert_element_type"(%31745, %31746) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%31747, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %31748 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %31749 = "torch.aten.unsqueeze"(%17993, %31748) : (!torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>
    %31750 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %31751 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %31752 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %31753 = "torch.prim.ListConstruct"(%31750, %31751, %31752) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %31754 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %31755 = "torch.aten.expand"(%31749, %31753, %31754) : (!torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ>
    %31756 = "torch_c.to_builtin_tensor"(%31747) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %31757 = "torch_c.to_builtin_tensor"(%31755) : (!torch.vtensor<[4,4096,4096],f8E4M3FNUZ>) -> tensor<4x4096x4096xf8E4M3FNUZ>
    %31758 = "util.call"(%31756, %31757) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x?x4096xf32>
    %31759 = "torch_c.from_builtin_tensor"(%31758) : (tensor<4x?x4096xf32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%31759, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %31760 = "torch.aten.div.Tensor"(%31759, %17995) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%31760, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %31761 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %31762 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %31763 = "torch.aten.clamp"(%31760, %31761, %31762) : (!torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%31763, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %31764 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %31765 = "torch.prims.convert_element_type"(%31763, %31764) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%31765, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %31766 = "torch.aten.div.Tensor"(%31741, %17997) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%31766, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %31767 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %31768 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %31769 = "torch.aten.clamp"(%31766, %31767, %31768) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%31769, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %31770 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %31771 = "torch.prims.convert_element_type"(%31769, %31770) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%31771, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %31772 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %31773 = "torch.aten.unsqueeze"(%17999, %31772) : (!torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>
    %31774 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %31775 = "torch.constant.int"() <{value = 1024 : i64}> : () -> !torch.int
    %31776 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %31777 = "torch.prim.ListConstruct"(%31774, %31775, %31776) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %31778 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %31779 = "torch.aten.expand"(%31773, %31777, %31778) : (!torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ>
    %31780 = "torch_c.to_builtin_tensor"(%31771) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %31781 = "torch_c.to_builtin_tensor"(%31779) : (!torch.vtensor<[4,1024,4096],f8E4M3FNUZ>) -> tensor<4x1024x4096xf8E4M3FNUZ>
    %31782 = "util.call"(%31780, %31781) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x?x1024xf32>
    %31783 = "torch_c.from_builtin_tensor"(%31782) : (tensor<4x?x1024xf32>) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%31783, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %31784 = "torch.aten.div.Tensor"(%31783, %18001) : (!torch.vtensor<[4,?,1024],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%31784, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %31785 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %31786 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %31787 = "torch.aten.clamp"(%31784, %31785, %31786) : (!torch.vtensor<[4,?,1024],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%31787, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %31788 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %31789 = "torch.prims.convert_element_type"(%31787, %31788) : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> !torch.vtensor<[4,?,1024],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%31789, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.int) -> ()
    %31790 = "torch.aten.div.Tensor"(%31741, %18003) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%31790, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %31791 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %31792 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %31793 = "torch.aten.clamp"(%31790, %31791, %31792) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%31793, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %31794 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %31795 = "torch.prims.convert_element_type"(%31793, %31794) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%31795, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %31796 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %31797 = "torch.aten.unsqueeze"(%18005, %31796) : (!torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>
    %31798 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %31799 = "torch.constant.int"() <{value = 1024 : i64}> : () -> !torch.int
    %31800 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %31801 = "torch.prim.ListConstruct"(%31798, %31799, %31800) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %31802 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %31803 = "torch.aten.expand"(%31797, %31801, %31802) : (!torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ>
    %31804 = "torch_c.to_builtin_tensor"(%31795) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %31805 = "torch_c.to_builtin_tensor"(%31803) : (!torch.vtensor<[4,1024,4096],f8E4M3FNUZ>) -> tensor<4x1024x4096xf8E4M3FNUZ>
    %31806 = "util.call"(%31804, %31805) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x?x1024xf32>
    %31807 = "torch_c.from_builtin_tensor"(%31806) : (tensor<4x?x1024xf32>) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%31807, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %31808 = "torch.aten.div.Tensor"(%31807, %18007) : (!torch.vtensor<[4,?,1024],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%31808, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %31809 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %31810 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %31811 = "torch.aten.clamp"(%31808, %31809, %31810) : (!torch.vtensor<[4,?,1024],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%31811, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %31812 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %31813 = "torch.prims.convert_element_type"(%31811, %31812) : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> !torch.vtensor<[4,?,1024],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%31813, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.int) -> ()
    %31814 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %31815 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %31816 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %31817 = "torch.prim.ListConstruct"(%31814, %18481, %31815, %31816) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %31818 = "torch.aten.view"(%31765, %31817) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%31818, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %31819 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %31820 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %31821 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %31822 = "torch.prim.ListConstruct"(%31819, %18481, %31820, %31821) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %31823 = "torch.aten.view"(%31789, %31822) : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%31823, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %31824 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %31825 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %31826 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %31827 = "torch.prim.ListConstruct"(%31824, %18481, %31825, %31826) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %31828 = "torch.aten.view"(%31813, %31827) : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%31828, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %31829 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
    %31830 = "torch.constant.none"() : () -> !torch.none
    %31831 = "torch.constant.none"() : () -> !torch.none
    %31832 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %31833 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %31834 = "torch.aten.arange"(%31829, %31830, %31831, %31832, %31833) : (!torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[131072],si64>
    %31835 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %31836 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %31837 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %31838 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %31839 = "torch.constant.none"() : () -> !torch.none
    %31840 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %31841 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %31842 = "torch.aten.arange.start_step"(%31835, %31836, %31837, %31838, %31839, %31840, %31841) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[64],si64>
    %31843 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %31844 = "torch.prims.convert_element_type"(%31842, %31843) : (!torch.vtensor<[64],si64>, !torch.int) -> !torch.vtensor<[64],f32>
    %31845 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %31846 = "torch.aten.div.Scalar"(%31844, %31845) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %31847 = "torch.constant.float"() <{value = 5.000000e+05 : f64}> : () -> !torch.float
    %31848 = "torch.aten.pow.Scalar"(%31847, %31846) : (!torch.float, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %31849 = "torch.aten.reciprocal"(%31848) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %31850 = "torch.constant.float"() <{value = 1.000000e+00 : f64}> : () -> !torch.float
    %31851 = "torch.aten.mul.Scalar"(%31849, %31850) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
    %31852 = "torch.aten.reciprocal"(%31851) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %31853 = "torch.constant.float"() <{value = 6.2831853071795862 : f64}> : () -> !torch.float
    %31854 = "torch.aten.mul.Scalar"(%31852, %31853) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
    %31855 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
    %31856 = "torch.aten.gt.Scalar"(%31854, %31855) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %31857 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %31858 = "torch.aten.div.Scalar"(%31851, %31857) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %31859 = "torch.aten.where.self"(%31856, %31858, %31851) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %31860 = "torch.aten.reciprocal"(%31854) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %31861 = "torch.constant.int"() <{value = 8192 : i64}> : () -> !torch.int
    %31862 = "torch.aten.mul.Scalar"(%31860, %31861) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %31863 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %31864 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %31865 = "torch.aten.sub.Scalar"(%31862, %31863, %31864) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
    %31866 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %31867 = "torch.aten.div.Scalar"(%31865, %31866) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %31868 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %31869 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %31870 = "torch.aten.rsub.Scalar"(%31867, %31868, %31869) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
    %31871 = "torch.aten.mul.Tensor"(%31870, %31859) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %31872 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %31873 = "torch.aten.div.Scalar"(%31871, %31872) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %31874 = "torch.aten.mul.Tensor"(%31867, %31859) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %31875 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %31876 = "torch.aten.add.Tensor"(%31873, %31874, %31875) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %31877 = "torch.constant.float"() <{value = 2.048000e+03 : f64}> : () -> !torch.float
    %31878 = "torch.aten.lt.Scalar"(%31854, %31877) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %31879 = "torch.aten.bitwise_not"(%31878) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %31880 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
    %31881 = "torch.aten.gt.Scalar"(%31854, %31880) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %31882 = "torch.aten.bitwise_not"(%31881) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %31883 = "torch.aten.mul.Tensor"(%31879, %31882) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %31884 = "torch.aten.where.self"(%31883, %31876, %31859) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %31885 = "torch.prim.ListConstruct"(%31884, %31884) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.list<vtensor>
    %31886 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %31887 = "torch.aten.cat"(%31885, %31886) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[128],f32>
    %31888 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %31889 = "torch.prims.convert_element_type"(%31834, %31888) : (!torch.vtensor<[131072],si64>, !torch.int) -> !torch.vtensor<[131072],f32>
    %31890 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %31891 = "torch.prims.convert_element_type"(%31887, %31890) : (!torch.vtensor<[128],f32>, !torch.int) -> !torch.vtensor<[128],f32>
    %31892 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
    %31893 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %31894 = "torch.prim.ListConstruct"(%31892, %31893) : (!torch.int, !torch.int) -> !torch.list<int>
    %31895 = "torch.aten.view"(%31889, %31894) : (!torch.vtensor<[131072],f32>, !torch.list<int>) -> !torch.vtensor<[131072,1],f32>
    %31896 = "torch.aten.mul.Tensor"(%31895, %31891) : (!torch.vtensor<[131072,1],f32>, !torch.vtensor<[128],f32>) -> !torch.vtensor<[131072,128],f32>
    %31897 = "torch.aten.cos"(%31896) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
    %31898 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %31899 = "torch.prims.convert_element_type"(%31897, %31898) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
    %31900 = "torch.aten.sin"(%31896) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
    %31901 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %31902 = "torch.prims.convert_element_type"(%31900, %31901) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
    %31903 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %31904 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %31905 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %31906 = "torch.aten.slice.Tensor"(%31899, %31903, %31904, %18481, %31905) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%31906, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %31907 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %31908 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %31909 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %31910 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %31911 = "torch.aten.slice.Tensor"(%31906, %31907, %31908, %31909, %31910) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%31911, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %31912 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %31913 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %31914 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %31915 = "torch.aten.slice.Tensor"(%31902, %31912, %31913, %18481, %31914) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%31915, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %31916 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %31917 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %31918 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %31919 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %31920 = "torch.aten.slice.Tensor"(%31915, %31916, %31917, %31918, %31919) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%31920, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %31921 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %31922 = "torch.aten.unsqueeze"(%31911, %31921) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%31922, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %31923 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %31924 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %31925 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %31926 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %31927 = "torch.aten.slice.Tensor"(%31922, %31923, %31924, %31925, %31926) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%31927, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %31928 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %31929 = "torch.aten.unsqueeze"(%31927, %31928) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%31929, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %31930 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %31931 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %31932 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %31933 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %31934 = "torch.aten.slice.Tensor"(%31929, %31930, %31931, %31932, %31933) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%31934, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %31935 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %31936 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %31937 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %31938 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %31939 = "torch.prim.ListConstruct"(%31935, %31936, %31937, %31938) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %31940 = "torch.aten.repeat"(%31934, %31939) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%31940, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
    %31941 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %31942 = "torch.aten.unsqueeze"(%31920, %31941) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%31942, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %31943 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %31944 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %31945 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %31946 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %31947 = "torch.aten.slice.Tensor"(%31942, %31943, %31944, %31945, %31946) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%31947, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %31948 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %31949 = "torch.aten.unsqueeze"(%31947, %31948) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%31949, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %31950 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %31951 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %31952 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %31953 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %31954 = "torch.aten.slice.Tensor"(%31949, %31950, %31951, %31952, %31953) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%31954, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %31955 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %31956 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %31957 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %31958 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %31959 = "torch.prim.ListConstruct"(%31955, %31956, %31957, %31958) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %31960 = "torch.aten.repeat"(%31954, %31959) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%31960, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
    %31961 = "torch.aten.mul.Tensor"(%31818, %31940) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%31961, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %31962 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %31963 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %31964 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %31965 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %31966 = "torch.aten.slice.Tensor"(%31818, %31962, %31963, %31964, %31965) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%31966, %18474) <{shape_expressions = #map18}> : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.int) -> ()
    %31967 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %31968 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %31969 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %31970 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %31971 = "torch.aten.slice.Tensor"(%31818, %31967, %31968, %31969, %31970) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%31971, %18474) <{shape_expressions = #map18}> : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.int) -> ()
    %31972 = "torch.aten.neg"(%31971) : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>) -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%31972, %18474) <{shape_expressions = #map18}> : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.int) -> ()
    %31973 = "torch.prim.ListConstruct"(%31972, %31966) : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>) -> !torch.list<vtensor>
    %31974 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %31975 = "torch.aten.cat"(%31973, %31974) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%31975, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %31976 = "torch.aten.mul.Tensor"(%31975, %31960) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%31976, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %31977 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %31978 = "torch.aten.add.Tensor"(%31961, %31976, %31977) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%31978, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %31979 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
    %31980 = "torch.constant.none"() : () -> !torch.none
    %31981 = "torch.constant.none"() : () -> !torch.none
    %31982 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %31983 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %31984 = "torch.aten.arange"(%31979, %31980, %31981, %31982, %31983) : (!torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[131072],si64>
    %31985 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %31986 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %31987 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %31988 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %31989 = "torch.constant.none"() : () -> !torch.none
    %31990 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %31991 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %31992 = "torch.aten.arange.start_step"(%31985, %31986, %31987, %31988, %31989, %31990, %31991) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[64],si64>
    %31993 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %31994 = "torch.prims.convert_element_type"(%31992, %31993) : (!torch.vtensor<[64],si64>, !torch.int) -> !torch.vtensor<[64],f32>
    %31995 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %31996 = "torch.aten.div.Scalar"(%31994, %31995) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %31997 = "torch.constant.float"() <{value = 5.000000e+05 : f64}> : () -> !torch.float
    %31998 = "torch.aten.pow.Scalar"(%31997, %31996) : (!torch.float, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %31999 = "torch.aten.reciprocal"(%31998) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %32000 = "torch.constant.float"() <{value = 1.000000e+00 : f64}> : () -> !torch.float
    %32001 = "torch.aten.mul.Scalar"(%31999, %32000) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
    %32002 = "torch.aten.reciprocal"(%32001) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %32003 = "torch.constant.float"() <{value = 6.2831853071795862 : f64}> : () -> !torch.float
    %32004 = "torch.aten.mul.Scalar"(%32002, %32003) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
    %32005 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
    %32006 = "torch.aten.gt.Scalar"(%32004, %32005) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %32007 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %32008 = "torch.aten.div.Scalar"(%32001, %32007) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %32009 = "torch.aten.where.self"(%32006, %32008, %32001) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %32010 = "torch.aten.reciprocal"(%32004) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %32011 = "torch.constant.int"() <{value = 8192 : i64}> : () -> !torch.int
    %32012 = "torch.aten.mul.Scalar"(%32010, %32011) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %32013 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %32014 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %32015 = "torch.aten.sub.Scalar"(%32012, %32013, %32014) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
    %32016 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %32017 = "torch.aten.div.Scalar"(%32015, %32016) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %32018 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %32019 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %32020 = "torch.aten.rsub.Scalar"(%32017, %32018, %32019) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
    %32021 = "torch.aten.mul.Tensor"(%32020, %32009) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %32022 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %32023 = "torch.aten.div.Scalar"(%32021, %32022) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %32024 = "torch.aten.mul.Tensor"(%32017, %32009) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %32025 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %32026 = "torch.aten.add.Tensor"(%32023, %32024, %32025) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %32027 = "torch.constant.float"() <{value = 2.048000e+03 : f64}> : () -> !torch.float
    %32028 = "torch.aten.lt.Scalar"(%32004, %32027) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %32029 = "torch.aten.bitwise_not"(%32028) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %32030 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
    %32031 = "torch.aten.gt.Scalar"(%32004, %32030) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %32032 = "torch.aten.bitwise_not"(%32031) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %32033 = "torch.aten.mul.Tensor"(%32029, %32032) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %32034 = "torch.aten.where.self"(%32033, %32026, %32009) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %32035 = "torch.prim.ListConstruct"(%32034, %32034) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.list<vtensor>
    %32036 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %32037 = "torch.aten.cat"(%32035, %32036) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[128],f32>
    %32038 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %32039 = "torch.prims.convert_element_type"(%31984, %32038) : (!torch.vtensor<[131072],si64>, !torch.int) -> !torch.vtensor<[131072],f32>
    %32040 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %32041 = "torch.prims.convert_element_type"(%32037, %32040) : (!torch.vtensor<[128],f32>, !torch.int) -> !torch.vtensor<[128],f32>
    %32042 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
    %32043 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %32044 = "torch.prim.ListConstruct"(%32042, %32043) : (!torch.int, !torch.int) -> !torch.list<int>
    %32045 = "torch.aten.view"(%32039, %32044) : (!torch.vtensor<[131072],f32>, !torch.list<int>) -> !torch.vtensor<[131072,1],f32>
    %32046 = "torch.aten.mul.Tensor"(%32045, %32041) : (!torch.vtensor<[131072,1],f32>, !torch.vtensor<[128],f32>) -> !torch.vtensor<[131072,128],f32>
    %32047 = "torch.aten.cos"(%32046) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
    %32048 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %32049 = "torch.prims.convert_element_type"(%32047, %32048) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
    %32050 = "torch.aten.sin"(%32046) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
    %32051 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %32052 = "torch.prims.convert_element_type"(%32050, %32051) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
    %32053 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %32054 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %32055 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %32056 = "torch.aten.slice.Tensor"(%32049, %32053, %32054, %18481, %32055) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%32056, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %32057 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %32058 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %32059 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %32060 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %32061 = "torch.aten.slice.Tensor"(%32056, %32057, %32058, %32059, %32060) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%32061, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %32062 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %32063 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %32064 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %32065 = "torch.aten.slice.Tensor"(%32052, %32062, %32063, %18481, %32064) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%32065, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %32066 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %32067 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %32068 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %32069 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %32070 = "torch.aten.slice.Tensor"(%32065, %32066, %32067, %32068, %32069) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%32070, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %32071 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %32072 = "torch.aten.unsqueeze"(%32061, %32071) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%32072, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %32073 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %32074 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %32075 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %32076 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %32077 = "torch.aten.slice.Tensor"(%32072, %32073, %32074, %32075, %32076) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%32077, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %32078 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %32079 = "torch.aten.unsqueeze"(%32077, %32078) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%32079, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %32080 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %32081 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %32082 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %32083 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %32084 = "torch.aten.slice.Tensor"(%32079, %32080, %32081, %32082, %32083) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%32084, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %32085 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %32086 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %32087 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %32088 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %32089 = "torch.prim.ListConstruct"(%32085, %32086, %32087, %32088) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %32090 = "torch.aten.repeat"(%32084, %32089) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%32090, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
    %32091 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %32092 = "torch.aten.unsqueeze"(%32070, %32091) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%32092, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %32093 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %32094 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %32095 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %32096 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %32097 = "torch.aten.slice.Tensor"(%32092, %32093, %32094, %32095, %32096) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%32097, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %32098 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %32099 = "torch.aten.unsqueeze"(%32097, %32098) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%32099, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %32100 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %32101 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %32102 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %32103 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %32104 = "torch.aten.slice.Tensor"(%32099, %32100, %32101, %32102, %32103) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%32104, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %32105 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %32106 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %32107 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %32108 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %32109 = "torch.prim.ListConstruct"(%32105, %32106, %32107, %32108) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %32110 = "torch.aten.repeat"(%32104, %32109) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%32110, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
    %32111 = "torch.aten.mul.Tensor"(%31823, %32090) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%32111, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %32112 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %32113 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %32114 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %32115 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %32116 = "torch.aten.slice.Tensor"(%31823, %32112, %32113, %32114, %32115) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%32116, %18474) <{shape_expressions = #map19}> : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.int) -> ()
    %32117 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %32118 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %32119 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %32120 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %32121 = "torch.aten.slice.Tensor"(%31823, %32117, %32118, %32119, %32120) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%32121, %18474) <{shape_expressions = #map19}> : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.int) -> ()
    %32122 = "torch.aten.neg"(%32121) : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>) -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%32122, %18474) <{shape_expressions = #map19}> : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.int) -> ()
    %32123 = "torch.prim.ListConstruct"(%32122, %32116) : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>) -> !torch.list<vtensor>
    %32124 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %32125 = "torch.aten.cat"(%32123, %32124) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%32125, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %32126 = "torch.aten.mul.Tensor"(%32125, %32110) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%32126, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %32127 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %32128 = "torch.aten.add.Tensor"(%32111, %32126, %32127) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%32128, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %32129 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %32130 = "torch.aten.mul.Scalar"(%arg69, %32129) : (!torch.vtensor<[4,?],si64>, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%32130, %18474) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %32131 = "torch.constant.int"() <{value = 40 : i64}> : () -> !torch.int
    %32132 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %32133 = "torch.aten.add.Scalar"(%32130, %32131, %32132) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%32133, %18474) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %32134 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %32135 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %32136 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %32137 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %32138 = "torch.prim.ListConstruct"(%32134, %18477, %32135, %32136, %32137) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %32139 = "torch.aten.view"(%32128, %32138) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%32139, %18474) <{shape_expressions = #map22}> : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %32140 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %32141 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %32142 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %32143 = "torch.prim.ListConstruct"(%19011, %32140, %32141, %32142) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %32144 = "torch.aten.view"(%32139, %32143) : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%32144, %18474) <{shape_expressions = #map23}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %32145 = "torch.prim.ListConstruct"(%19011) : (!torch.int) -> !torch.list<int>
    %32146 = "torch.aten.view"(%32133, %32145) : (!torch.vtensor<[4,?],si64>, !torch.list<int>) -> !torch.vtensor<[?],si64>
    "torch.bind_symbolic_shape"(%32146, %18474) <{shape_expressions = #map24}> : (!torch.vtensor<[?],si64>, !torch.int) -> ()
    %32147 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %32148 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %32149 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %32150 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %32151 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %32152 = "torch.prim.ListConstruct"(%18479, %32147, %32148, %32149, %32150, %32151) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %32153 = "torch.aten.view"(%31555, %32152) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%32153, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %32154 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %32155 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %32156 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %32157 = "torch.prim.ListConstruct"(%18993, %32154, %32155, %32156) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %32158 = "torch.aten.view"(%32153, %32157) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%32158, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %32159 = "torch.prim.ListConstruct"(%32146) : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>>
    %32160 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %32161 = "torch.aten.index_put"(%32158, %32159, %32144, %32160) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.bool) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%32161, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %32162 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %32163 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %32164 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %32165 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %32166 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %32167 = "torch.prim.ListConstruct"(%18479, %32162, %32163, %32164, %32165, %32166) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %32168 = "torch.aten.view"(%32161, %32167) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%32168, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %32169 = "torch.constant.int"() <{value = 2097152 : i64}> : () -> !torch.int
    %32170 = "torch.prim.ListConstruct"(%18479, %32169) : (!torch.int, !torch.int) -> !torch.list<int>
    %32171 = "torch.aten.view"(%32168, %32170) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,2097152],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%32171, %18475) <{shape_expressions = #map2}> : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> ()
    %32172 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %32173 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %32174 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %32175 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %32176 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %32177 = "torch.prim.ListConstruct"(%18479, %32172, %32173, %32174, %32175, %32176) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %32178 = "torch.aten.view"(%32171, %32177) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%32178, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %32179 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %32180 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %32181 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %32182 = "torch.prim.ListConstruct"(%18993, %32179, %32180, %32181) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %32183 = "torch.aten.view"(%32178, %32182) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%32183, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %32184 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %32185 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %32186 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %32187 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %32188 = "torch.prim.ListConstruct"(%32184, %18477, %32185, %32186, %32187) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %32189 = "torch.aten.view"(%31828, %32188) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%32189, %18474) <{shape_expressions = #map22}> : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %32190 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %32191 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %32192 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %32193 = "torch.prim.ListConstruct"(%19011, %32190, %32191, %32192) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %32194 = "torch.aten.view"(%32189, %32193) : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%32194, %18474) <{shape_expressions = #map23}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %32195 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %32196 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %32197 = "torch.aten.add.Scalar"(%32133, %32195, %32196) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%32197, %18474) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %32198 = "torch.prim.ListConstruct"(%19011) : (!torch.int) -> !torch.list<int>
    %32199 = "torch.aten.view"(%32197, %32198) : (!torch.vtensor<[4,?],si64>, !torch.list<int>) -> !torch.vtensor<[?],si64>
    "torch.bind_symbolic_shape"(%32199, %18474) <{shape_expressions = #map24}> : (!torch.vtensor<[?],si64>, !torch.int) -> ()
    %32200 = "torch.prim.ListConstruct"(%32199) : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>>
    %32201 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %32202 = "torch.aten.index_put"(%32183, %32200, %32194, %32201) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.bool) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%32202, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %32203 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %32204 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %32205 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %32206 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %32207 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %32208 = "torch.prim.ListConstruct"(%18479, %32203, %32204, %32205, %32206, %32207) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %32209 = "torch.aten.view"(%32202, %32208) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%32209, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %32210 = "torch.constant.int"() <{value = 2097152 : i64}> : () -> !torch.int
    %32211 = "torch.prim.ListConstruct"(%18479, %32210) : (!torch.int, !torch.int) -> !torch.list<int>
    %32212 = "torch.aten.view"(%32209, %32211) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,2097152],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%32212, %18475) <{shape_expressions = #map2}> : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> ()
    %32213 = "torch.constant.int"() <{value = -2 : i64}> : () -> !torch.int
    %32214 = "torch.aten.unsqueeze"(%32128, %32213) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%32214, %18474) <{shape_expressions = #map25}> : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.int) -> ()
    %32215 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %32216 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %32217 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %32218 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %32219 = "torch.prim.ListConstruct"(%32215, %18481, %32216, %32217, %32218) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %32220 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %32221 = "torch.aten.expand"(%32214, %32219, %32220) : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%32221, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %32222 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %32223 = "torch.aten.clone"(%32221, %32222) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%32223, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %32224 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %32225 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %32226 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %32227 = "torch.prim.ListConstruct"(%32224, %18481, %32225, %32226) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %32228 = "torch.aten._unsafe_view"(%32223, %32227) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%32228, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %32229 = "torch.constant.int"() <{value = -2 : i64}> : () -> !torch.int
    %32230 = "torch.aten.unsqueeze"(%31828, %32229) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%32230, %18474) <{shape_expressions = #map25}> : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.int) -> ()
    %32231 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %32232 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %32233 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %32234 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %32235 = "torch.prim.ListConstruct"(%32231, %18481, %32232, %32233, %32234) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %32236 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %32237 = "torch.aten.expand"(%32230, %32235, %32236) : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%32237, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %32238 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %32239 = "torch.aten.clone"(%32237, %32238) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%32239, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %32240 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %32241 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %32242 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %32243 = "torch.prim.ListConstruct"(%32240, %18481, %32241, %32242) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %32244 = "torch.aten._unsafe_view"(%32239, %32243) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%32244, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %32245 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %32246 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %32247 = "torch.aten.transpose.int"(%31978, %32245, %32246) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%32247, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %32248 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %32249 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %32250 = "torch.aten.transpose.int"(%32228, %32248, %32249) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%32250, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %32251 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %32252 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %32253 = "torch.aten.transpose.int"(%32244, %32251, %32252) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%32253, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %32254 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %32255 = "torch.aten.squeeze.dim"(%18570, %32254) : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,1,?,?],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%32255, %18474) <{shape_expressions = #map8}> : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> ()
    %32256 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %32257 = "torch.aten.squeeze.dim"(%32255, %32256) : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,1,?,?],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%32257, %18474) <{shape_expressions = #map8}> : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> ()
    %32258 = "torch_c.to_builtin_tensor"(%32247) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %32259 = "torch_c.to_builtin_tensor"(%32250) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %32260 = "torch_c.to_builtin_tensor"(%32253) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %32261 = "torch_c.to_builtin_tensor"(%32257) : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>) -> tensor<4x1x?x?xf8E4M3FNUZ>
    %32262 = "tensor.cast"(%32261) : (tensor<4x1x?x?xf8E4M3FNUZ>) -> tensor<?x?x?x?xf8E4M3FNUZ>
    %32263 = "torch_c.to_builtin_tensor"(%18009) : (!torch.vtensor<[],f32>) -> tensor<f32>
    %32264 = "util.call"(%32258, %32259, %32260, %32263, %32262) <{callee = @sharktank_masked_flash_attention_4_32_128_128_f8E4M3FNUZ_f32_f32}> : (tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<f32>, tensor<?x?x?x?xf8E4M3FNUZ>) -> tensor<4x32x?x128xf32>
    %32265 = "torch_c.from_builtin_tensor"(%32264) : (tensor<4x32x?x128xf32>) -> !torch.vtensor<[4,32,?,128],f32>
    "torch.bind_symbolic_shape"(%32265, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f32>, !torch.int) -> ()
    %32266 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %32267 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %32268 = "torch.aten.transpose.int"(%32265, %32266, %32267) : (!torch.vtensor<[4,32,?,128],f32>, !torch.int, !torch.int) -> !torch.vtensor<[4,?,32,128],f32>
    "torch.bind_symbolic_shape"(%32268, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f32>, !torch.int) -> ()
    %32269 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %32270 = "torch.aten.clone"(%32268, %32269) : (!torch.vtensor<[4,?,32,128],f32>, !torch.int) -> !torch.vtensor<[4,?,32,128],f32>
    "torch.bind_symbolic_shape"(%32270, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f32>, !torch.int) -> ()
    %32271 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %32272 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %32273 = "torch.prim.ListConstruct"(%32271, %18481, %32272) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %32274 = "torch.aten._unsafe_view"(%32270, %32273) : (!torch.vtensor<[4,?,32,128],f32>, !torch.list<int>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%32274, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %32275 = "torch.aten.div.Tensor"(%32274, %18011) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%32275, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %32276 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %32277 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %32278 = "torch.aten.clamp"(%32275, %32276, %32277) : (!torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%32278, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %32279 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %32280 = "torch.prims.convert_element_type"(%32278, %32279) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%32280, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %32281 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %32282 = "torch.aten.unsqueeze"(%18013, %32281) : (!torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>
    %32283 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %32284 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %32285 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %32286 = "torch.prim.ListConstruct"(%32283, %32284, %32285) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %32287 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %32288 = "torch.aten.expand"(%32282, %32286, %32287) : (!torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ>
    %32289 = "torch_c.to_builtin_tensor"(%32280) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %32290 = "torch_c.to_builtin_tensor"(%32288) : (!torch.vtensor<[4,4096,4096],f8E4M3FNUZ>) -> tensor<4x4096x4096xf8E4M3FNUZ>
    %32291 = "util.call"(%32289, %32290) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x?x4096xf32>
    %32292 = "torch_c.from_builtin_tensor"(%32291) : (tensor<4x?x4096xf32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%32292, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %32293 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %32294 = "torch.prims.convert_element_type"(%32292, %32293) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%32294, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %32295 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %32296 = "torch.aten.add.Tensor"(%31722, %32294, %32295) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%32296, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %32297 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %32298 = "torch.prims.convert_element_type"(%32296, %32297) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%32298, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %32299 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %32300 = "torch.aten.pow.Tensor_Scalar"(%32298, %32299) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%32300, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %32301 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %32302 = "torch.prim.ListConstruct"(%32301) : (!torch.int) -> !torch.list<int>
    %32303 = "torch.constant.bool"() <{value = true}> : () -> !torch.bool
    %32304 = "torch.constant.none"() : () -> !torch.none
    %32305 = "torch.aten.mean.dim"(%32300, %32302, %32303, %32304) : (!torch.vtensor<[4,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%32305, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %32306 = "torch.constant.float"() <{value = 1.000000e-05 : f64}> : () -> !torch.float
    %32307 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %32308 = "torch.aten.add.Scalar"(%32305, %32306, %32307) : (!torch.vtensor<[4,?,1],f32>, !torch.float, !torch.int) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%32308, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %32309 = "torch.aten.rsqrt"(%32308) : (!torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%32309, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %32310 = "torch.aten.mul.Tensor"(%32298, %32309) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%32310, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %32311 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %32312 = "torch.prims.convert_element_type"(%32310, %32311) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%32312, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %32313 = "torch.aten.mul.Tensor"(%18015, %32312) : (!torch.vtensor<[4096],bf16>, !torch.vtensor<[4,?,4096],bf16>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%32313, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %32314 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %32315 = "torch.prims.convert_element_type"(%32313, %32314) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%32315, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %32316 = "torch.aten.div.Tensor"(%32315, %18017) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%32316, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %32317 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %32318 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %32319 = "torch.aten.clamp"(%32316, %32317, %32318) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%32319, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %32320 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %32321 = "torch.prims.convert_element_type"(%32319, %32320) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%32321, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %32322 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %32323 = "torch.aten.unsqueeze"(%18019, %32322) : (!torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>
    %32324 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %32325 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %32326 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %32327 = "torch.prim.ListConstruct"(%32324, %32325, %32326) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %32328 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %32329 = "torch.aten.expand"(%32323, %32327, %32328) : (!torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ>
    %32330 = "torch_c.to_builtin_tensor"(%32321) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %32331 = "torch_c.to_builtin_tensor"(%32329) : (!torch.vtensor<[4,14336,4096],f8E4M3FNUZ>) -> tensor<4x14336x4096xf8E4M3FNUZ>
    %32332 = "util.call"(%32330, %32331) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x?x14336xf32>
    %32333 = "torch_c.from_builtin_tensor"(%32332) : (tensor<4x?x14336xf32>) -> !torch.vtensor<[4,?,14336],f32>
    "torch.bind_symbolic_shape"(%32333, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> ()
    %32334 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %32335 = "torch.prims.convert_element_type"(%32333, %32334) : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%32335, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %32336 = "torch.aten.silu"(%32335) : (!torch.vtensor<[4,?,14336],bf16>) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%32336, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %32337 = "torch.aten.div.Tensor"(%32315, %18021) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%32337, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %32338 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %32339 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %32340 = "torch.aten.clamp"(%32337, %32338, %32339) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%32340, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %32341 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %32342 = "torch.prims.convert_element_type"(%32340, %32341) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%32342, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %32343 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %32344 = "torch.aten.unsqueeze"(%18023, %32343) : (!torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>
    %32345 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %32346 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %32347 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %32348 = "torch.prim.ListConstruct"(%32345, %32346, %32347) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %32349 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %32350 = "torch.aten.expand"(%32344, %32348, %32349) : (!torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ>
    %32351 = "torch_c.to_builtin_tensor"(%32342) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %32352 = "torch_c.to_builtin_tensor"(%32350) : (!torch.vtensor<[4,14336,4096],f8E4M3FNUZ>) -> tensor<4x14336x4096xf8E4M3FNUZ>
    %32353 = "util.call"(%32351, %32352) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x?x14336xf32>
    %32354 = "torch_c.from_builtin_tensor"(%32353) : (tensor<4x?x14336xf32>) -> !torch.vtensor<[4,?,14336],f32>
    "torch.bind_symbolic_shape"(%32354, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> ()
    %32355 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %32356 = "torch.prims.convert_element_type"(%32354, %32355) : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%32356, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %32357 = "torch.aten.mul.Tensor"(%32336, %32356) : (!torch.vtensor<[4,?,14336],bf16>, !torch.vtensor<[4,?,14336],bf16>) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%32357, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %32358 = "torch.aten.div.Tensor"(%32357, %18025) : (!torch.vtensor<[4,?,14336],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%32358, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %32359 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %32360 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %32361 = "torch.aten.clamp"(%32358, %32359, %32360) : (!torch.vtensor<[4,?,14336],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%32361, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %32362 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %32363 = "torch.prims.convert_element_type"(%32361, %32362) : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> !torch.vtensor<[4,?,14336],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%32363, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],f8E4M3FNUZ>, !torch.int) -> ()
    %32364 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %32365 = "torch.aten.unsqueeze"(%18027, %32364) : (!torch.vtensor<[4096,14336],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,14336],f8E4M3FNUZ>
    %32366 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %32367 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %32368 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %32369 = "torch.prim.ListConstruct"(%32366, %32367, %32368) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %32370 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %32371 = "torch.aten.expand"(%32365, %32369, %32370) : (!torch.vtensor<[1,4096,14336],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,14336],f8E4M3FNUZ>
    %32372 = "torch_c.to_builtin_tensor"(%32363) : (!torch.vtensor<[4,?,14336],f8E4M3FNUZ>) -> tensor<4x?x14336xf8E4M3FNUZ>
    %32373 = "torch_c.to_builtin_tensor"(%32371) : (!torch.vtensor<[4,4096,14336],f8E4M3FNUZ>) -> tensor<4x4096x14336xf8E4M3FNUZ>
    %32374 = "util.call"(%32372, %32373) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx14336xf8E4M3FNUZ_R4x4096x14336xf8E4M3FNUZ}> : (tensor<4x?x14336xf8E4M3FNUZ>, tensor<4x4096x14336xf8E4M3FNUZ>) -> tensor<4x?x4096xf32>
    %32375 = "torch_c.from_builtin_tensor"(%32374) : (tensor<4x?x4096xf32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%32375, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %32376 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %32377 = "torch.prims.convert_element_type"(%32375, %32376) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%32377, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %32378 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %32379 = "torch.aten.add.Tensor"(%32296, %32377, %32378) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%32379, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %32380 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %32381 = "torch.prims.convert_element_type"(%32379, %32380) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%32381, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %32382 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %32383 = "torch.aten.pow.Tensor_Scalar"(%32381, %32382) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%32383, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %32384 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %32385 = "torch.prim.ListConstruct"(%32384) : (!torch.int) -> !torch.list<int>
    %32386 = "torch.constant.bool"() <{value = true}> : () -> !torch.bool
    %32387 = "torch.constant.none"() : () -> !torch.none
    %32388 = "torch.aten.mean.dim"(%32383, %32385, %32386, %32387) : (!torch.vtensor<[4,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%32388, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %32389 = "torch.constant.float"() <{value = 1.000000e-05 : f64}> : () -> !torch.float
    %32390 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %32391 = "torch.aten.add.Scalar"(%32388, %32389, %32390) : (!torch.vtensor<[4,?,1],f32>, !torch.float, !torch.int) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%32391, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %32392 = "torch.aten.rsqrt"(%32391) : (!torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%32392, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %32393 = "torch.aten.mul.Tensor"(%32381, %32392) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%32393, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %32394 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %32395 = "torch.prims.convert_element_type"(%32393, %32394) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%32395, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %32396 = "torch.aten.mul.Tensor"(%18029, %32395) : (!torch.vtensor<[4096],bf16>, !torch.vtensor<[4,?,4096],bf16>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%32396, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %32397 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %32398 = "torch.prims.convert_element_type"(%32396, %32397) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%32398, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %32399 = "torch.aten.div.Tensor"(%32398, %18031) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%32399, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %32400 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %32401 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %32402 = "torch.aten.clamp"(%32399, %32400, %32401) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%32402, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %32403 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %32404 = "torch.prims.convert_element_type"(%32402, %32403) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%32404, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %32405 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %32406 = "torch.aten.unsqueeze"(%18033, %32405) : (!torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>
    %32407 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %32408 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %32409 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %32410 = "torch.prim.ListConstruct"(%32407, %32408, %32409) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %32411 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %32412 = "torch.aten.expand"(%32406, %32410, %32411) : (!torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ>
    %32413 = "torch_c.to_builtin_tensor"(%32404) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %32414 = "torch_c.to_builtin_tensor"(%32412) : (!torch.vtensor<[4,4096,4096],f8E4M3FNUZ>) -> tensor<4x4096x4096xf8E4M3FNUZ>
    %32415 = "util.call"(%32413, %32414) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x?x4096xf32>
    %32416 = "torch_c.from_builtin_tensor"(%32415) : (tensor<4x?x4096xf32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%32416, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %32417 = "torch.aten.div.Tensor"(%32416, %18035) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%32417, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %32418 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %32419 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %32420 = "torch.aten.clamp"(%32417, %32418, %32419) : (!torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%32420, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %32421 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %32422 = "torch.prims.convert_element_type"(%32420, %32421) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%32422, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %32423 = "torch.aten.div.Tensor"(%32398, %18037) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%32423, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %32424 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %32425 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %32426 = "torch.aten.clamp"(%32423, %32424, %32425) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%32426, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %32427 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %32428 = "torch.prims.convert_element_type"(%32426, %32427) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%32428, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %32429 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %32430 = "torch.aten.unsqueeze"(%18039, %32429) : (!torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>
    %32431 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %32432 = "torch.constant.int"() <{value = 1024 : i64}> : () -> !torch.int
    %32433 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %32434 = "torch.prim.ListConstruct"(%32431, %32432, %32433) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %32435 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %32436 = "torch.aten.expand"(%32430, %32434, %32435) : (!torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ>
    %32437 = "torch_c.to_builtin_tensor"(%32428) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %32438 = "torch_c.to_builtin_tensor"(%32436) : (!torch.vtensor<[4,1024,4096],f8E4M3FNUZ>) -> tensor<4x1024x4096xf8E4M3FNUZ>
    %32439 = "util.call"(%32437, %32438) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x?x1024xf32>
    %32440 = "torch_c.from_builtin_tensor"(%32439) : (tensor<4x?x1024xf32>) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%32440, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %32441 = "torch.aten.div.Tensor"(%32440, %18041) : (!torch.vtensor<[4,?,1024],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%32441, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %32442 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %32443 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %32444 = "torch.aten.clamp"(%32441, %32442, %32443) : (!torch.vtensor<[4,?,1024],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%32444, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %32445 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %32446 = "torch.prims.convert_element_type"(%32444, %32445) : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> !torch.vtensor<[4,?,1024],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%32446, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.int) -> ()
    %32447 = "torch.aten.div.Tensor"(%32398, %18043) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%32447, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %32448 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %32449 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %32450 = "torch.aten.clamp"(%32447, %32448, %32449) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%32450, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %32451 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %32452 = "torch.prims.convert_element_type"(%32450, %32451) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%32452, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %32453 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %32454 = "torch.aten.unsqueeze"(%18045, %32453) : (!torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>
    %32455 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %32456 = "torch.constant.int"() <{value = 1024 : i64}> : () -> !torch.int
    %32457 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %32458 = "torch.prim.ListConstruct"(%32455, %32456, %32457) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %32459 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %32460 = "torch.aten.expand"(%32454, %32458, %32459) : (!torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ>
    %32461 = "torch_c.to_builtin_tensor"(%32452) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %32462 = "torch_c.to_builtin_tensor"(%32460) : (!torch.vtensor<[4,1024,4096],f8E4M3FNUZ>) -> tensor<4x1024x4096xf8E4M3FNUZ>
    %32463 = "util.call"(%32461, %32462) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x?x1024xf32>
    %32464 = "torch_c.from_builtin_tensor"(%32463) : (tensor<4x?x1024xf32>) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%32464, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %32465 = "torch.aten.div.Tensor"(%32464, %18047) : (!torch.vtensor<[4,?,1024],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%32465, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %32466 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %32467 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %32468 = "torch.aten.clamp"(%32465, %32466, %32467) : (!torch.vtensor<[4,?,1024],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%32468, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %32469 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %32470 = "torch.prims.convert_element_type"(%32468, %32469) : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> !torch.vtensor<[4,?,1024],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%32470, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.int) -> ()
    %32471 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %32472 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %32473 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %32474 = "torch.prim.ListConstruct"(%32471, %18481, %32472, %32473) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %32475 = "torch.aten.view"(%32422, %32474) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%32475, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %32476 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %32477 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %32478 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %32479 = "torch.prim.ListConstruct"(%32476, %18481, %32477, %32478) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %32480 = "torch.aten.view"(%32446, %32479) : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%32480, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %32481 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %32482 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %32483 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %32484 = "torch.prim.ListConstruct"(%32481, %18481, %32482, %32483) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %32485 = "torch.aten.view"(%32470, %32484) : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%32485, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %32486 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
    %32487 = "torch.constant.none"() : () -> !torch.none
    %32488 = "torch.constant.none"() : () -> !torch.none
    %32489 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %32490 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %32491 = "torch.aten.arange"(%32486, %32487, %32488, %32489, %32490) : (!torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[131072],si64>
    %32492 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %32493 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %32494 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %32495 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %32496 = "torch.constant.none"() : () -> !torch.none
    %32497 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %32498 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %32499 = "torch.aten.arange.start_step"(%32492, %32493, %32494, %32495, %32496, %32497, %32498) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[64],si64>
    %32500 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %32501 = "torch.prims.convert_element_type"(%32499, %32500) : (!torch.vtensor<[64],si64>, !torch.int) -> !torch.vtensor<[64],f32>
    %32502 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %32503 = "torch.aten.div.Scalar"(%32501, %32502) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %32504 = "torch.constant.float"() <{value = 5.000000e+05 : f64}> : () -> !torch.float
    %32505 = "torch.aten.pow.Scalar"(%32504, %32503) : (!torch.float, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %32506 = "torch.aten.reciprocal"(%32505) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %32507 = "torch.constant.float"() <{value = 1.000000e+00 : f64}> : () -> !torch.float
    %32508 = "torch.aten.mul.Scalar"(%32506, %32507) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
    %32509 = "torch.aten.reciprocal"(%32508) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %32510 = "torch.constant.float"() <{value = 6.2831853071795862 : f64}> : () -> !torch.float
    %32511 = "torch.aten.mul.Scalar"(%32509, %32510) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
    %32512 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
    %32513 = "torch.aten.gt.Scalar"(%32511, %32512) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %32514 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %32515 = "torch.aten.div.Scalar"(%32508, %32514) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %32516 = "torch.aten.where.self"(%32513, %32515, %32508) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %32517 = "torch.aten.reciprocal"(%32511) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %32518 = "torch.constant.int"() <{value = 8192 : i64}> : () -> !torch.int
    %32519 = "torch.aten.mul.Scalar"(%32517, %32518) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %32520 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %32521 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %32522 = "torch.aten.sub.Scalar"(%32519, %32520, %32521) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
    %32523 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %32524 = "torch.aten.div.Scalar"(%32522, %32523) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %32525 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %32526 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %32527 = "torch.aten.rsub.Scalar"(%32524, %32525, %32526) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
    %32528 = "torch.aten.mul.Tensor"(%32527, %32516) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %32529 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %32530 = "torch.aten.div.Scalar"(%32528, %32529) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %32531 = "torch.aten.mul.Tensor"(%32524, %32516) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %32532 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %32533 = "torch.aten.add.Tensor"(%32530, %32531, %32532) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %32534 = "torch.constant.float"() <{value = 2.048000e+03 : f64}> : () -> !torch.float
    %32535 = "torch.aten.lt.Scalar"(%32511, %32534) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %32536 = "torch.aten.bitwise_not"(%32535) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %32537 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
    %32538 = "torch.aten.gt.Scalar"(%32511, %32537) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %32539 = "torch.aten.bitwise_not"(%32538) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %32540 = "torch.aten.mul.Tensor"(%32536, %32539) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %32541 = "torch.aten.where.self"(%32540, %32533, %32516) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %32542 = "torch.prim.ListConstruct"(%32541, %32541) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.list<vtensor>
    %32543 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %32544 = "torch.aten.cat"(%32542, %32543) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[128],f32>
    %32545 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %32546 = "torch.prims.convert_element_type"(%32491, %32545) : (!torch.vtensor<[131072],si64>, !torch.int) -> !torch.vtensor<[131072],f32>
    %32547 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %32548 = "torch.prims.convert_element_type"(%32544, %32547) : (!torch.vtensor<[128],f32>, !torch.int) -> !torch.vtensor<[128],f32>
    %32549 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
    %32550 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %32551 = "torch.prim.ListConstruct"(%32549, %32550) : (!torch.int, !torch.int) -> !torch.list<int>
    %32552 = "torch.aten.view"(%32546, %32551) : (!torch.vtensor<[131072],f32>, !torch.list<int>) -> !torch.vtensor<[131072,1],f32>
    %32553 = "torch.aten.mul.Tensor"(%32552, %32548) : (!torch.vtensor<[131072,1],f32>, !torch.vtensor<[128],f32>) -> !torch.vtensor<[131072,128],f32>
    %32554 = "torch.aten.cos"(%32553) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
    %32555 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %32556 = "torch.prims.convert_element_type"(%32554, %32555) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
    %32557 = "torch.aten.sin"(%32553) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
    %32558 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %32559 = "torch.prims.convert_element_type"(%32557, %32558) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
    %32560 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %32561 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %32562 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %32563 = "torch.aten.slice.Tensor"(%32556, %32560, %32561, %18481, %32562) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%32563, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %32564 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %32565 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %32566 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %32567 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %32568 = "torch.aten.slice.Tensor"(%32563, %32564, %32565, %32566, %32567) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%32568, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %32569 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %32570 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %32571 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %32572 = "torch.aten.slice.Tensor"(%32559, %32569, %32570, %18481, %32571) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%32572, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %32573 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %32574 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %32575 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %32576 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %32577 = "torch.aten.slice.Tensor"(%32572, %32573, %32574, %32575, %32576) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%32577, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %32578 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %32579 = "torch.aten.unsqueeze"(%32568, %32578) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%32579, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %32580 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %32581 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %32582 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %32583 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %32584 = "torch.aten.slice.Tensor"(%32579, %32580, %32581, %32582, %32583) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%32584, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %32585 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %32586 = "torch.aten.unsqueeze"(%32584, %32585) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%32586, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %32587 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %32588 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %32589 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %32590 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %32591 = "torch.aten.slice.Tensor"(%32586, %32587, %32588, %32589, %32590) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%32591, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %32592 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %32593 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %32594 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %32595 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %32596 = "torch.prim.ListConstruct"(%32592, %32593, %32594, %32595) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %32597 = "torch.aten.repeat"(%32591, %32596) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%32597, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
    %32598 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %32599 = "torch.aten.unsqueeze"(%32577, %32598) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%32599, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %32600 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %32601 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %32602 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %32603 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %32604 = "torch.aten.slice.Tensor"(%32599, %32600, %32601, %32602, %32603) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%32604, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %32605 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %32606 = "torch.aten.unsqueeze"(%32604, %32605) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%32606, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %32607 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %32608 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %32609 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %32610 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %32611 = "torch.aten.slice.Tensor"(%32606, %32607, %32608, %32609, %32610) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%32611, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %32612 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %32613 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %32614 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %32615 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %32616 = "torch.prim.ListConstruct"(%32612, %32613, %32614, %32615) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %32617 = "torch.aten.repeat"(%32611, %32616) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%32617, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
    %32618 = "torch.aten.mul.Tensor"(%32475, %32597) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%32618, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %32619 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %32620 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %32621 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %32622 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %32623 = "torch.aten.slice.Tensor"(%32475, %32619, %32620, %32621, %32622) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%32623, %18474) <{shape_expressions = #map18}> : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.int) -> ()
    %32624 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %32625 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %32626 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %32627 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %32628 = "torch.aten.slice.Tensor"(%32475, %32624, %32625, %32626, %32627) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%32628, %18474) <{shape_expressions = #map18}> : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.int) -> ()
    %32629 = "torch.aten.neg"(%32628) : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>) -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%32629, %18474) <{shape_expressions = #map18}> : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.int) -> ()
    %32630 = "torch.prim.ListConstruct"(%32629, %32623) : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>) -> !torch.list<vtensor>
    %32631 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %32632 = "torch.aten.cat"(%32630, %32631) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%32632, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %32633 = "torch.aten.mul.Tensor"(%32632, %32617) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%32633, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %32634 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %32635 = "torch.aten.add.Tensor"(%32618, %32633, %32634) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%32635, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %32636 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
    %32637 = "torch.constant.none"() : () -> !torch.none
    %32638 = "torch.constant.none"() : () -> !torch.none
    %32639 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %32640 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %32641 = "torch.aten.arange"(%32636, %32637, %32638, %32639, %32640) : (!torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[131072],si64>
    %32642 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %32643 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %32644 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %32645 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %32646 = "torch.constant.none"() : () -> !torch.none
    %32647 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %32648 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %32649 = "torch.aten.arange.start_step"(%32642, %32643, %32644, %32645, %32646, %32647, %32648) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[64],si64>
    %32650 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %32651 = "torch.prims.convert_element_type"(%32649, %32650) : (!torch.vtensor<[64],si64>, !torch.int) -> !torch.vtensor<[64],f32>
    %32652 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %32653 = "torch.aten.div.Scalar"(%32651, %32652) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %32654 = "torch.constant.float"() <{value = 5.000000e+05 : f64}> : () -> !torch.float
    %32655 = "torch.aten.pow.Scalar"(%32654, %32653) : (!torch.float, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %32656 = "torch.aten.reciprocal"(%32655) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %32657 = "torch.constant.float"() <{value = 1.000000e+00 : f64}> : () -> !torch.float
    %32658 = "torch.aten.mul.Scalar"(%32656, %32657) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
    %32659 = "torch.aten.reciprocal"(%32658) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %32660 = "torch.constant.float"() <{value = 6.2831853071795862 : f64}> : () -> !torch.float
    %32661 = "torch.aten.mul.Scalar"(%32659, %32660) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
    %32662 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
    %32663 = "torch.aten.gt.Scalar"(%32661, %32662) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %32664 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %32665 = "torch.aten.div.Scalar"(%32658, %32664) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %32666 = "torch.aten.where.self"(%32663, %32665, %32658) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %32667 = "torch.aten.reciprocal"(%32661) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %32668 = "torch.constant.int"() <{value = 8192 : i64}> : () -> !torch.int
    %32669 = "torch.aten.mul.Scalar"(%32667, %32668) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %32670 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %32671 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %32672 = "torch.aten.sub.Scalar"(%32669, %32670, %32671) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
    %32673 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %32674 = "torch.aten.div.Scalar"(%32672, %32673) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %32675 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %32676 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %32677 = "torch.aten.rsub.Scalar"(%32674, %32675, %32676) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
    %32678 = "torch.aten.mul.Tensor"(%32677, %32666) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %32679 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %32680 = "torch.aten.div.Scalar"(%32678, %32679) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %32681 = "torch.aten.mul.Tensor"(%32674, %32666) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %32682 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %32683 = "torch.aten.add.Tensor"(%32680, %32681, %32682) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %32684 = "torch.constant.float"() <{value = 2.048000e+03 : f64}> : () -> !torch.float
    %32685 = "torch.aten.lt.Scalar"(%32661, %32684) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %32686 = "torch.aten.bitwise_not"(%32685) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %32687 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
    %32688 = "torch.aten.gt.Scalar"(%32661, %32687) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %32689 = "torch.aten.bitwise_not"(%32688) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %32690 = "torch.aten.mul.Tensor"(%32686, %32689) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %32691 = "torch.aten.where.self"(%32690, %32683, %32666) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %32692 = "torch.prim.ListConstruct"(%32691, %32691) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.list<vtensor>
    %32693 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %32694 = "torch.aten.cat"(%32692, %32693) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[128],f32>
    %32695 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %32696 = "torch.prims.convert_element_type"(%32641, %32695) : (!torch.vtensor<[131072],si64>, !torch.int) -> !torch.vtensor<[131072],f32>
    %32697 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %32698 = "torch.prims.convert_element_type"(%32694, %32697) : (!torch.vtensor<[128],f32>, !torch.int) -> !torch.vtensor<[128],f32>
    %32699 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
    %32700 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %32701 = "torch.prim.ListConstruct"(%32699, %32700) : (!torch.int, !torch.int) -> !torch.list<int>
    %32702 = "torch.aten.view"(%32696, %32701) : (!torch.vtensor<[131072],f32>, !torch.list<int>) -> !torch.vtensor<[131072,1],f32>
    %32703 = "torch.aten.mul.Tensor"(%32702, %32698) : (!torch.vtensor<[131072,1],f32>, !torch.vtensor<[128],f32>) -> !torch.vtensor<[131072,128],f32>
    %32704 = "torch.aten.cos"(%32703) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
    %32705 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %32706 = "torch.prims.convert_element_type"(%32704, %32705) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
    %32707 = "torch.aten.sin"(%32703) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
    %32708 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %32709 = "torch.prims.convert_element_type"(%32707, %32708) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
    %32710 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %32711 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %32712 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %32713 = "torch.aten.slice.Tensor"(%32706, %32710, %32711, %18481, %32712) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%32713, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %32714 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %32715 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %32716 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %32717 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %32718 = "torch.aten.slice.Tensor"(%32713, %32714, %32715, %32716, %32717) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%32718, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %32719 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %32720 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %32721 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %32722 = "torch.aten.slice.Tensor"(%32709, %32719, %32720, %18481, %32721) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%32722, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %32723 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %32724 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %32725 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %32726 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %32727 = "torch.aten.slice.Tensor"(%32722, %32723, %32724, %32725, %32726) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%32727, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %32728 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %32729 = "torch.aten.unsqueeze"(%32718, %32728) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%32729, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %32730 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %32731 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %32732 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %32733 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %32734 = "torch.aten.slice.Tensor"(%32729, %32730, %32731, %32732, %32733) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%32734, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %32735 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %32736 = "torch.aten.unsqueeze"(%32734, %32735) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%32736, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %32737 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %32738 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %32739 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %32740 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %32741 = "torch.aten.slice.Tensor"(%32736, %32737, %32738, %32739, %32740) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%32741, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %32742 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %32743 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %32744 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %32745 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %32746 = "torch.prim.ListConstruct"(%32742, %32743, %32744, %32745) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %32747 = "torch.aten.repeat"(%32741, %32746) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%32747, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
    %32748 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %32749 = "torch.aten.unsqueeze"(%32727, %32748) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%32749, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %32750 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %32751 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %32752 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %32753 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %32754 = "torch.aten.slice.Tensor"(%32749, %32750, %32751, %32752, %32753) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%32754, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %32755 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %32756 = "torch.aten.unsqueeze"(%32754, %32755) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%32756, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %32757 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %32758 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %32759 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %32760 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %32761 = "torch.aten.slice.Tensor"(%32756, %32757, %32758, %32759, %32760) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%32761, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %32762 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %32763 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %32764 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %32765 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %32766 = "torch.prim.ListConstruct"(%32762, %32763, %32764, %32765) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %32767 = "torch.aten.repeat"(%32761, %32766) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%32767, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
    %32768 = "torch.aten.mul.Tensor"(%32480, %32747) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%32768, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %32769 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %32770 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %32771 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %32772 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %32773 = "torch.aten.slice.Tensor"(%32480, %32769, %32770, %32771, %32772) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%32773, %18474) <{shape_expressions = #map19}> : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.int) -> ()
    %32774 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %32775 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %32776 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %32777 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %32778 = "torch.aten.slice.Tensor"(%32480, %32774, %32775, %32776, %32777) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%32778, %18474) <{shape_expressions = #map19}> : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.int) -> ()
    %32779 = "torch.aten.neg"(%32778) : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>) -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%32779, %18474) <{shape_expressions = #map19}> : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.int) -> ()
    %32780 = "torch.prim.ListConstruct"(%32779, %32773) : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>) -> !torch.list<vtensor>
    %32781 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %32782 = "torch.aten.cat"(%32780, %32781) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%32782, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %32783 = "torch.aten.mul.Tensor"(%32782, %32767) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%32783, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %32784 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %32785 = "torch.aten.add.Tensor"(%32768, %32783, %32784) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%32785, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %32786 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %32787 = "torch.aten.mul.Scalar"(%arg69, %32786) : (!torch.vtensor<[4,?],si64>, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%32787, %18474) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %32788 = "torch.constant.int"() <{value = 42 : i64}> : () -> !torch.int
    %32789 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %32790 = "torch.aten.add.Scalar"(%32787, %32788, %32789) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%32790, %18474) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %32791 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %32792 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %32793 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %32794 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %32795 = "torch.prim.ListConstruct"(%32791, %18477, %32792, %32793, %32794) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %32796 = "torch.aten.view"(%32785, %32795) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%32796, %18474) <{shape_expressions = #map22}> : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %32797 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %32798 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %32799 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %32800 = "torch.prim.ListConstruct"(%19011, %32797, %32798, %32799) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %32801 = "torch.aten.view"(%32796, %32800) : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%32801, %18474) <{shape_expressions = #map23}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %32802 = "torch.prim.ListConstruct"(%19011) : (!torch.int) -> !torch.list<int>
    %32803 = "torch.aten.view"(%32790, %32802) : (!torch.vtensor<[4,?],si64>, !torch.list<int>) -> !torch.vtensor<[?],si64>
    "torch.bind_symbolic_shape"(%32803, %18474) <{shape_expressions = #map24}> : (!torch.vtensor<[?],si64>, !torch.int) -> ()
    %32804 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %32805 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %32806 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %32807 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %32808 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %32809 = "torch.prim.ListConstruct"(%18479, %32804, %32805, %32806, %32807, %32808) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %32810 = "torch.aten.view"(%32212, %32809) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%32810, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %32811 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %32812 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %32813 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %32814 = "torch.prim.ListConstruct"(%18993, %32811, %32812, %32813) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %32815 = "torch.aten.view"(%32810, %32814) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%32815, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %32816 = "torch.prim.ListConstruct"(%32803) : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>>
    %32817 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %32818 = "torch.aten.index_put"(%32815, %32816, %32801, %32817) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.bool) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%32818, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %32819 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %32820 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %32821 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %32822 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %32823 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %32824 = "torch.prim.ListConstruct"(%18479, %32819, %32820, %32821, %32822, %32823) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %32825 = "torch.aten.view"(%32818, %32824) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%32825, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %32826 = "torch.constant.int"() <{value = 2097152 : i64}> : () -> !torch.int
    %32827 = "torch.prim.ListConstruct"(%18479, %32826) : (!torch.int, !torch.int) -> !torch.list<int>
    %32828 = "torch.aten.view"(%32825, %32827) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,2097152],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%32828, %18475) <{shape_expressions = #map2}> : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> ()
    %32829 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %32830 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %32831 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %32832 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %32833 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %32834 = "torch.prim.ListConstruct"(%18479, %32829, %32830, %32831, %32832, %32833) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %32835 = "torch.aten.view"(%32828, %32834) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%32835, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %32836 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %32837 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %32838 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %32839 = "torch.prim.ListConstruct"(%18993, %32836, %32837, %32838) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %32840 = "torch.aten.view"(%32835, %32839) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%32840, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %32841 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %32842 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %32843 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %32844 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %32845 = "torch.prim.ListConstruct"(%32841, %18477, %32842, %32843, %32844) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %32846 = "torch.aten.view"(%32485, %32845) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%32846, %18474) <{shape_expressions = #map22}> : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %32847 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %32848 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %32849 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %32850 = "torch.prim.ListConstruct"(%19011, %32847, %32848, %32849) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %32851 = "torch.aten.view"(%32846, %32850) : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%32851, %18474) <{shape_expressions = #map23}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %32852 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %32853 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %32854 = "torch.aten.add.Scalar"(%32790, %32852, %32853) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%32854, %18474) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %32855 = "torch.prim.ListConstruct"(%19011) : (!torch.int) -> !torch.list<int>
    %32856 = "torch.aten.view"(%32854, %32855) : (!torch.vtensor<[4,?],si64>, !torch.list<int>) -> !torch.vtensor<[?],si64>
    "torch.bind_symbolic_shape"(%32856, %18474) <{shape_expressions = #map24}> : (!torch.vtensor<[?],si64>, !torch.int) -> ()
    %32857 = "torch.prim.ListConstruct"(%32856) : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>>
    %32858 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %32859 = "torch.aten.index_put"(%32840, %32857, %32851, %32858) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.bool) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%32859, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %32860 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %32861 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %32862 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %32863 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %32864 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %32865 = "torch.prim.ListConstruct"(%18479, %32860, %32861, %32862, %32863, %32864) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %32866 = "torch.aten.view"(%32859, %32865) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%32866, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %32867 = "torch.constant.int"() <{value = 2097152 : i64}> : () -> !torch.int
    %32868 = "torch.prim.ListConstruct"(%18479, %32867) : (!torch.int, !torch.int) -> !torch.list<int>
    %32869 = "torch.aten.view"(%32866, %32868) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,2097152],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%32869, %18475) <{shape_expressions = #map2}> : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> ()
    %32870 = "torch.constant.int"() <{value = -2 : i64}> : () -> !torch.int
    %32871 = "torch.aten.unsqueeze"(%32785, %32870) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%32871, %18474) <{shape_expressions = #map25}> : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.int) -> ()
    %32872 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %32873 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %32874 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %32875 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %32876 = "torch.prim.ListConstruct"(%32872, %18481, %32873, %32874, %32875) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %32877 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %32878 = "torch.aten.expand"(%32871, %32876, %32877) : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%32878, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %32879 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %32880 = "torch.aten.clone"(%32878, %32879) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%32880, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %32881 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %32882 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %32883 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %32884 = "torch.prim.ListConstruct"(%32881, %18481, %32882, %32883) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %32885 = "torch.aten._unsafe_view"(%32880, %32884) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%32885, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %32886 = "torch.constant.int"() <{value = -2 : i64}> : () -> !torch.int
    %32887 = "torch.aten.unsqueeze"(%32485, %32886) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%32887, %18474) <{shape_expressions = #map25}> : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.int) -> ()
    %32888 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %32889 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %32890 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %32891 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %32892 = "torch.prim.ListConstruct"(%32888, %18481, %32889, %32890, %32891) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %32893 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %32894 = "torch.aten.expand"(%32887, %32892, %32893) : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%32894, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %32895 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %32896 = "torch.aten.clone"(%32894, %32895) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%32896, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %32897 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %32898 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %32899 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %32900 = "torch.prim.ListConstruct"(%32897, %18481, %32898, %32899) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %32901 = "torch.aten._unsafe_view"(%32896, %32900) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%32901, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %32902 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %32903 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %32904 = "torch.aten.transpose.int"(%32635, %32902, %32903) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%32904, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %32905 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %32906 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %32907 = "torch.aten.transpose.int"(%32885, %32905, %32906) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%32907, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %32908 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %32909 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %32910 = "torch.aten.transpose.int"(%32901, %32908, %32909) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%32910, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %32911 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %32912 = "torch.aten.squeeze.dim"(%18570, %32911) : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,1,?,?],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%32912, %18474) <{shape_expressions = #map8}> : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> ()
    %32913 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %32914 = "torch.aten.squeeze.dim"(%32912, %32913) : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,1,?,?],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%32914, %18474) <{shape_expressions = #map8}> : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> ()
    %32915 = "torch_c.to_builtin_tensor"(%32904) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %32916 = "torch_c.to_builtin_tensor"(%32907) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %32917 = "torch_c.to_builtin_tensor"(%32910) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %32918 = "torch_c.to_builtin_tensor"(%32914) : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>) -> tensor<4x1x?x?xf8E4M3FNUZ>
    %32919 = "tensor.cast"(%32918) : (tensor<4x1x?x?xf8E4M3FNUZ>) -> tensor<?x?x?x?xf8E4M3FNUZ>
    %32920 = "torch_c.to_builtin_tensor"(%18049) : (!torch.vtensor<[],f32>) -> tensor<f32>
    %32921 = "util.call"(%32915, %32916, %32917, %32920, %32919) <{callee = @sharktank_masked_flash_attention_4_32_128_128_f8E4M3FNUZ_f32_f32}> : (tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<f32>, tensor<?x?x?x?xf8E4M3FNUZ>) -> tensor<4x32x?x128xf32>
    %32922 = "torch_c.from_builtin_tensor"(%32921) : (tensor<4x32x?x128xf32>) -> !torch.vtensor<[4,32,?,128],f32>
    "torch.bind_symbolic_shape"(%32922, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f32>, !torch.int) -> ()
    %32923 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %32924 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %32925 = "torch.aten.transpose.int"(%32922, %32923, %32924) : (!torch.vtensor<[4,32,?,128],f32>, !torch.int, !torch.int) -> !torch.vtensor<[4,?,32,128],f32>
    "torch.bind_symbolic_shape"(%32925, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f32>, !torch.int) -> ()
    %32926 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %32927 = "torch.aten.clone"(%32925, %32926) : (!torch.vtensor<[4,?,32,128],f32>, !torch.int) -> !torch.vtensor<[4,?,32,128],f32>
    "torch.bind_symbolic_shape"(%32927, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f32>, !torch.int) -> ()
    %32928 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %32929 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %32930 = "torch.prim.ListConstruct"(%32928, %18481, %32929) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %32931 = "torch.aten._unsafe_view"(%32927, %32930) : (!torch.vtensor<[4,?,32,128],f32>, !torch.list<int>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%32931, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %32932 = "torch.aten.div.Tensor"(%32931, %18051) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%32932, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %32933 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %32934 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %32935 = "torch.aten.clamp"(%32932, %32933, %32934) : (!torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%32935, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %32936 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %32937 = "torch.prims.convert_element_type"(%32935, %32936) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%32937, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %32938 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %32939 = "torch.aten.unsqueeze"(%18053, %32938) : (!torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>
    %32940 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %32941 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %32942 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %32943 = "torch.prim.ListConstruct"(%32940, %32941, %32942) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %32944 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %32945 = "torch.aten.expand"(%32939, %32943, %32944) : (!torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ>
    %32946 = "torch_c.to_builtin_tensor"(%32937) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %32947 = "torch_c.to_builtin_tensor"(%32945) : (!torch.vtensor<[4,4096,4096],f8E4M3FNUZ>) -> tensor<4x4096x4096xf8E4M3FNUZ>
    %32948 = "util.call"(%32946, %32947) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x?x4096xf32>
    %32949 = "torch_c.from_builtin_tensor"(%32948) : (tensor<4x?x4096xf32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%32949, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %32950 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %32951 = "torch.prims.convert_element_type"(%32949, %32950) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%32951, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %32952 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %32953 = "torch.aten.add.Tensor"(%32379, %32951, %32952) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%32953, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %32954 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %32955 = "torch.prims.convert_element_type"(%32953, %32954) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%32955, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %32956 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %32957 = "torch.aten.pow.Tensor_Scalar"(%32955, %32956) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%32957, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %32958 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %32959 = "torch.prim.ListConstruct"(%32958) : (!torch.int) -> !torch.list<int>
    %32960 = "torch.constant.bool"() <{value = true}> : () -> !torch.bool
    %32961 = "torch.constant.none"() : () -> !torch.none
    %32962 = "torch.aten.mean.dim"(%32957, %32959, %32960, %32961) : (!torch.vtensor<[4,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%32962, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %32963 = "torch.constant.float"() <{value = 1.000000e-05 : f64}> : () -> !torch.float
    %32964 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %32965 = "torch.aten.add.Scalar"(%32962, %32963, %32964) : (!torch.vtensor<[4,?,1],f32>, !torch.float, !torch.int) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%32965, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %32966 = "torch.aten.rsqrt"(%32965) : (!torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%32966, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %32967 = "torch.aten.mul.Tensor"(%32955, %32966) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%32967, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %32968 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %32969 = "torch.prims.convert_element_type"(%32967, %32968) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%32969, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %32970 = "torch.aten.mul.Tensor"(%18055, %32969) : (!torch.vtensor<[4096],bf16>, !torch.vtensor<[4,?,4096],bf16>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%32970, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %32971 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %32972 = "torch.prims.convert_element_type"(%32970, %32971) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%32972, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %32973 = "torch.aten.div.Tensor"(%32972, %18057) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%32973, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %32974 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %32975 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %32976 = "torch.aten.clamp"(%32973, %32974, %32975) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%32976, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %32977 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %32978 = "torch.prims.convert_element_type"(%32976, %32977) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%32978, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %32979 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %32980 = "torch.aten.unsqueeze"(%18059, %32979) : (!torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>
    %32981 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %32982 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %32983 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %32984 = "torch.prim.ListConstruct"(%32981, %32982, %32983) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %32985 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %32986 = "torch.aten.expand"(%32980, %32984, %32985) : (!torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ>
    %32987 = "torch_c.to_builtin_tensor"(%32978) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %32988 = "torch_c.to_builtin_tensor"(%32986) : (!torch.vtensor<[4,14336,4096],f8E4M3FNUZ>) -> tensor<4x14336x4096xf8E4M3FNUZ>
    %32989 = "util.call"(%32987, %32988) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x?x14336xf32>
    %32990 = "torch_c.from_builtin_tensor"(%32989) : (tensor<4x?x14336xf32>) -> !torch.vtensor<[4,?,14336],f32>
    "torch.bind_symbolic_shape"(%32990, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> ()
    %32991 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %32992 = "torch.prims.convert_element_type"(%32990, %32991) : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%32992, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %32993 = "torch.aten.silu"(%32992) : (!torch.vtensor<[4,?,14336],bf16>) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%32993, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %32994 = "torch.aten.div.Tensor"(%32972, %18061) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%32994, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %32995 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %32996 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %32997 = "torch.aten.clamp"(%32994, %32995, %32996) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%32997, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %32998 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %32999 = "torch.prims.convert_element_type"(%32997, %32998) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%32999, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %33000 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %33001 = "torch.aten.unsqueeze"(%18063, %33000) : (!torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>
    %33002 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %33003 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %33004 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %33005 = "torch.prim.ListConstruct"(%33002, %33003, %33004) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %33006 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %33007 = "torch.aten.expand"(%33001, %33005, %33006) : (!torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ>
    %33008 = "torch_c.to_builtin_tensor"(%32999) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %33009 = "torch_c.to_builtin_tensor"(%33007) : (!torch.vtensor<[4,14336,4096],f8E4M3FNUZ>) -> tensor<4x14336x4096xf8E4M3FNUZ>
    %33010 = "util.call"(%33008, %33009) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x?x14336xf32>
    %33011 = "torch_c.from_builtin_tensor"(%33010) : (tensor<4x?x14336xf32>) -> !torch.vtensor<[4,?,14336],f32>
    "torch.bind_symbolic_shape"(%33011, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> ()
    %33012 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %33013 = "torch.prims.convert_element_type"(%33011, %33012) : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%33013, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %33014 = "torch.aten.mul.Tensor"(%32993, %33013) : (!torch.vtensor<[4,?,14336],bf16>, !torch.vtensor<[4,?,14336],bf16>) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%33014, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %33015 = "torch.aten.div.Tensor"(%33014, %18065) : (!torch.vtensor<[4,?,14336],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%33015, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %33016 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %33017 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %33018 = "torch.aten.clamp"(%33015, %33016, %33017) : (!torch.vtensor<[4,?,14336],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%33018, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %33019 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %33020 = "torch.prims.convert_element_type"(%33018, %33019) : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> !torch.vtensor<[4,?,14336],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%33020, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],f8E4M3FNUZ>, !torch.int) -> ()
    %33021 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %33022 = "torch.aten.unsqueeze"(%18067, %33021) : (!torch.vtensor<[4096,14336],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,14336],f8E4M3FNUZ>
    %33023 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %33024 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %33025 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %33026 = "torch.prim.ListConstruct"(%33023, %33024, %33025) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %33027 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %33028 = "torch.aten.expand"(%33022, %33026, %33027) : (!torch.vtensor<[1,4096,14336],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,14336],f8E4M3FNUZ>
    %33029 = "torch_c.to_builtin_tensor"(%33020) : (!torch.vtensor<[4,?,14336],f8E4M3FNUZ>) -> tensor<4x?x14336xf8E4M3FNUZ>
    %33030 = "torch_c.to_builtin_tensor"(%33028) : (!torch.vtensor<[4,4096,14336],f8E4M3FNUZ>) -> tensor<4x4096x14336xf8E4M3FNUZ>
    %33031 = "util.call"(%33029, %33030) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx14336xf8E4M3FNUZ_R4x4096x14336xf8E4M3FNUZ}> : (tensor<4x?x14336xf8E4M3FNUZ>, tensor<4x4096x14336xf8E4M3FNUZ>) -> tensor<4x?x4096xf32>
    %33032 = "torch_c.from_builtin_tensor"(%33031) : (tensor<4x?x4096xf32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%33032, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %33033 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %33034 = "torch.prims.convert_element_type"(%33032, %33033) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%33034, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %33035 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %33036 = "torch.aten.add.Tensor"(%32953, %33034, %33035) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%33036, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %33037 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %33038 = "torch.prims.convert_element_type"(%33036, %33037) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%33038, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %33039 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %33040 = "torch.aten.pow.Tensor_Scalar"(%33038, %33039) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%33040, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %33041 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %33042 = "torch.prim.ListConstruct"(%33041) : (!torch.int) -> !torch.list<int>
    %33043 = "torch.constant.bool"() <{value = true}> : () -> !torch.bool
    %33044 = "torch.constant.none"() : () -> !torch.none
    %33045 = "torch.aten.mean.dim"(%33040, %33042, %33043, %33044) : (!torch.vtensor<[4,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%33045, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %33046 = "torch.constant.float"() <{value = 1.000000e-05 : f64}> : () -> !torch.float
    %33047 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %33048 = "torch.aten.add.Scalar"(%33045, %33046, %33047) : (!torch.vtensor<[4,?,1],f32>, !torch.float, !torch.int) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%33048, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %33049 = "torch.aten.rsqrt"(%33048) : (!torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%33049, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %33050 = "torch.aten.mul.Tensor"(%33038, %33049) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%33050, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %33051 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %33052 = "torch.prims.convert_element_type"(%33050, %33051) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%33052, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %33053 = "torch.aten.mul.Tensor"(%18069, %33052) : (!torch.vtensor<[4096],bf16>, !torch.vtensor<[4,?,4096],bf16>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%33053, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %33054 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %33055 = "torch.prims.convert_element_type"(%33053, %33054) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%33055, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %33056 = "torch.aten.div.Tensor"(%33055, %18071) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%33056, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %33057 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %33058 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %33059 = "torch.aten.clamp"(%33056, %33057, %33058) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%33059, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %33060 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %33061 = "torch.prims.convert_element_type"(%33059, %33060) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%33061, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %33062 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %33063 = "torch.aten.unsqueeze"(%18073, %33062) : (!torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>
    %33064 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %33065 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %33066 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %33067 = "torch.prim.ListConstruct"(%33064, %33065, %33066) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %33068 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %33069 = "torch.aten.expand"(%33063, %33067, %33068) : (!torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ>
    %33070 = "torch_c.to_builtin_tensor"(%33061) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %33071 = "torch_c.to_builtin_tensor"(%33069) : (!torch.vtensor<[4,4096,4096],f8E4M3FNUZ>) -> tensor<4x4096x4096xf8E4M3FNUZ>
    %33072 = "util.call"(%33070, %33071) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x?x4096xf32>
    %33073 = "torch_c.from_builtin_tensor"(%33072) : (tensor<4x?x4096xf32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%33073, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %33074 = "torch.aten.div.Tensor"(%33073, %18075) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%33074, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %33075 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %33076 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %33077 = "torch.aten.clamp"(%33074, %33075, %33076) : (!torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%33077, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %33078 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %33079 = "torch.prims.convert_element_type"(%33077, %33078) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%33079, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %33080 = "torch.aten.div.Tensor"(%33055, %18077) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%33080, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %33081 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %33082 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %33083 = "torch.aten.clamp"(%33080, %33081, %33082) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%33083, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %33084 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %33085 = "torch.prims.convert_element_type"(%33083, %33084) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%33085, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %33086 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %33087 = "torch.aten.unsqueeze"(%18079, %33086) : (!torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>
    %33088 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %33089 = "torch.constant.int"() <{value = 1024 : i64}> : () -> !torch.int
    %33090 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %33091 = "torch.prim.ListConstruct"(%33088, %33089, %33090) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %33092 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %33093 = "torch.aten.expand"(%33087, %33091, %33092) : (!torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ>
    %33094 = "torch_c.to_builtin_tensor"(%33085) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %33095 = "torch_c.to_builtin_tensor"(%33093) : (!torch.vtensor<[4,1024,4096],f8E4M3FNUZ>) -> tensor<4x1024x4096xf8E4M3FNUZ>
    %33096 = "util.call"(%33094, %33095) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x?x1024xf32>
    %33097 = "torch_c.from_builtin_tensor"(%33096) : (tensor<4x?x1024xf32>) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%33097, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %33098 = "torch.aten.div.Tensor"(%33097, %18081) : (!torch.vtensor<[4,?,1024],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%33098, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %33099 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %33100 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %33101 = "torch.aten.clamp"(%33098, %33099, %33100) : (!torch.vtensor<[4,?,1024],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%33101, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %33102 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %33103 = "torch.prims.convert_element_type"(%33101, %33102) : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> !torch.vtensor<[4,?,1024],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%33103, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.int) -> ()
    %33104 = "torch.aten.div.Tensor"(%33055, %18083) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%33104, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %33105 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %33106 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %33107 = "torch.aten.clamp"(%33104, %33105, %33106) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%33107, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %33108 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %33109 = "torch.prims.convert_element_type"(%33107, %33108) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%33109, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %33110 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %33111 = "torch.aten.unsqueeze"(%18085, %33110) : (!torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>
    %33112 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %33113 = "torch.constant.int"() <{value = 1024 : i64}> : () -> !torch.int
    %33114 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %33115 = "torch.prim.ListConstruct"(%33112, %33113, %33114) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %33116 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %33117 = "torch.aten.expand"(%33111, %33115, %33116) : (!torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ>
    %33118 = "torch_c.to_builtin_tensor"(%33109) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %33119 = "torch_c.to_builtin_tensor"(%33117) : (!torch.vtensor<[4,1024,4096],f8E4M3FNUZ>) -> tensor<4x1024x4096xf8E4M3FNUZ>
    %33120 = "util.call"(%33118, %33119) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x?x1024xf32>
    %33121 = "torch_c.from_builtin_tensor"(%33120) : (tensor<4x?x1024xf32>) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%33121, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %33122 = "torch.aten.div.Tensor"(%33121, %18087) : (!torch.vtensor<[4,?,1024],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%33122, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %33123 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %33124 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %33125 = "torch.aten.clamp"(%33122, %33123, %33124) : (!torch.vtensor<[4,?,1024],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%33125, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %33126 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %33127 = "torch.prims.convert_element_type"(%33125, %33126) : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> !torch.vtensor<[4,?,1024],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%33127, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.int) -> ()
    %33128 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %33129 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %33130 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %33131 = "torch.prim.ListConstruct"(%33128, %18481, %33129, %33130) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %33132 = "torch.aten.view"(%33079, %33131) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%33132, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %33133 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %33134 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %33135 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %33136 = "torch.prim.ListConstruct"(%33133, %18481, %33134, %33135) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %33137 = "torch.aten.view"(%33103, %33136) : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%33137, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %33138 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %33139 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %33140 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %33141 = "torch.prim.ListConstruct"(%33138, %18481, %33139, %33140) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %33142 = "torch.aten.view"(%33127, %33141) : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%33142, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %33143 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
    %33144 = "torch.constant.none"() : () -> !torch.none
    %33145 = "torch.constant.none"() : () -> !torch.none
    %33146 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %33147 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %33148 = "torch.aten.arange"(%33143, %33144, %33145, %33146, %33147) : (!torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[131072],si64>
    %33149 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %33150 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %33151 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %33152 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %33153 = "torch.constant.none"() : () -> !torch.none
    %33154 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %33155 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %33156 = "torch.aten.arange.start_step"(%33149, %33150, %33151, %33152, %33153, %33154, %33155) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[64],si64>
    %33157 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %33158 = "torch.prims.convert_element_type"(%33156, %33157) : (!torch.vtensor<[64],si64>, !torch.int) -> !torch.vtensor<[64],f32>
    %33159 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %33160 = "torch.aten.div.Scalar"(%33158, %33159) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %33161 = "torch.constant.float"() <{value = 5.000000e+05 : f64}> : () -> !torch.float
    %33162 = "torch.aten.pow.Scalar"(%33161, %33160) : (!torch.float, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %33163 = "torch.aten.reciprocal"(%33162) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %33164 = "torch.constant.float"() <{value = 1.000000e+00 : f64}> : () -> !torch.float
    %33165 = "torch.aten.mul.Scalar"(%33163, %33164) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
    %33166 = "torch.aten.reciprocal"(%33165) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %33167 = "torch.constant.float"() <{value = 6.2831853071795862 : f64}> : () -> !torch.float
    %33168 = "torch.aten.mul.Scalar"(%33166, %33167) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
    %33169 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
    %33170 = "torch.aten.gt.Scalar"(%33168, %33169) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %33171 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %33172 = "torch.aten.div.Scalar"(%33165, %33171) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %33173 = "torch.aten.where.self"(%33170, %33172, %33165) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %33174 = "torch.aten.reciprocal"(%33168) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %33175 = "torch.constant.int"() <{value = 8192 : i64}> : () -> !torch.int
    %33176 = "torch.aten.mul.Scalar"(%33174, %33175) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %33177 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %33178 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %33179 = "torch.aten.sub.Scalar"(%33176, %33177, %33178) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
    %33180 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %33181 = "torch.aten.div.Scalar"(%33179, %33180) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %33182 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %33183 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %33184 = "torch.aten.rsub.Scalar"(%33181, %33182, %33183) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
    %33185 = "torch.aten.mul.Tensor"(%33184, %33173) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %33186 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %33187 = "torch.aten.div.Scalar"(%33185, %33186) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %33188 = "torch.aten.mul.Tensor"(%33181, %33173) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %33189 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %33190 = "torch.aten.add.Tensor"(%33187, %33188, %33189) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %33191 = "torch.constant.float"() <{value = 2.048000e+03 : f64}> : () -> !torch.float
    %33192 = "torch.aten.lt.Scalar"(%33168, %33191) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %33193 = "torch.aten.bitwise_not"(%33192) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %33194 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
    %33195 = "torch.aten.gt.Scalar"(%33168, %33194) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %33196 = "torch.aten.bitwise_not"(%33195) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %33197 = "torch.aten.mul.Tensor"(%33193, %33196) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %33198 = "torch.aten.where.self"(%33197, %33190, %33173) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %33199 = "torch.prim.ListConstruct"(%33198, %33198) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.list<vtensor>
    %33200 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %33201 = "torch.aten.cat"(%33199, %33200) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[128],f32>
    %33202 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %33203 = "torch.prims.convert_element_type"(%33148, %33202) : (!torch.vtensor<[131072],si64>, !torch.int) -> !torch.vtensor<[131072],f32>
    %33204 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %33205 = "torch.prims.convert_element_type"(%33201, %33204) : (!torch.vtensor<[128],f32>, !torch.int) -> !torch.vtensor<[128],f32>
    %33206 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
    %33207 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %33208 = "torch.prim.ListConstruct"(%33206, %33207) : (!torch.int, !torch.int) -> !torch.list<int>
    %33209 = "torch.aten.view"(%33203, %33208) : (!torch.vtensor<[131072],f32>, !torch.list<int>) -> !torch.vtensor<[131072,1],f32>
    %33210 = "torch.aten.mul.Tensor"(%33209, %33205) : (!torch.vtensor<[131072,1],f32>, !torch.vtensor<[128],f32>) -> !torch.vtensor<[131072,128],f32>
    %33211 = "torch.aten.cos"(%33210) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
    %33212 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %33213 = "torch.prims.convert_element_type"(%33211, %33212) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
    %33214 = "torch.aten.sin"(%33210) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
    %33215 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %33216 = "torch.prims.convert_element_type"(%33214, %33215) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
    %33217 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %33218 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %33219 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %33220 = "torch.aten.slice.Tensor"(%33213, %33217, %33218, %18481, %33219) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%33220, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %33221 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %33222 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %33223 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %33224 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %33225 = "torch.aten.slice.Tensor"(%33220, %33221, %33222, %33223, %33224) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%33225, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %33226 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %33227 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %33228 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %33229 = "torch.aten.slice.Tensor"(%33216, %33226, %33227, %18481, %33228) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%33229, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %33230 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %33231 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %33232 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %33233 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %33234 = "torch.aten.slice.Tensor"(%33229, %33230, %33231, %33232, %33233) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%33234, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %33235 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %33236 = "torch.aten.unsqueeze"(%33225, %33235) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%33236, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %33237 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %33238 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %33239 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %33240 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %33241 = "torch.aten.slice.Tensor"(%33236, %33237, %33238, %33239, %33240) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%33241, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %33242 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %33243 = "torch.aten.unsqueeze"(%33241, %33242) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%33243, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %33244 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %33245 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %33246 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %33247 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %33248 = "torch.aten.slice.Tensor"(%33243, %33244, %33245, %33246, %33247) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%33248, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %33249 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %33250 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %33251 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %33252 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %33253 = "torch.prim.ListConstruct"(%33249, %33250, %33251, %33252) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %33254 = "torch.aten.repeat"(%33248, %33253) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%33254, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
    %33255 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %33256 = "torch.aten.unsqueeze"(%33234, %33255) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%33256, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %33257 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %33258 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %33259 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %33260 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %33261 = "torch.aten.slice.Tensor"(%33256, %33257, %33258, %33259, %33260) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%33261, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %33262 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %33263 = "torch.aten.unsqueeze"(%33261, %33262) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%33263, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %33264 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %33265 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %33266 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %33267 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %33268 = "torch.aten.slice.Tensor"(%33263, %33264, %33265, %33266, %33267) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%33268, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %33269 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %33270 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %33271 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %33272 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %33273 = "torch.prim.ListConstruct"(%33269, %33270, %33271, %33272) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %33274 = "torch.aten.repeat"(%33268, %33273) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%33274, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
    %33275 = "torch.aten.mul.Tensor"(%33132, %33254) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%33275, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %33276 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %33277 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %33278 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %33279 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %33280 = "torch.aten.slice.Tensor"(%33132, %33276, %33277, %33278, %33279) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%33280, %18474) <{shape_expressions = #map18}> : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.int) -> ()
    %33281 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %33282 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %33283 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %33284 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %33285 = "torch.aten.slice.Tensor"(%33132, %33281, %33282, %33283, %33284) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%33285, %18474) <{shape_expressions = #map18}> : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.int) -> ()
    %33286 = "torch.aten.neg"(%33285) : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>) -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%33286, %18474) <{shape_expressions = #map18}> : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.int) -> ()
    %33287 = "torch.prim.ListConstruct"(%33286, %33280) : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>) -> !torch.list<vtensor>
    %33288 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %33289 = "torch.aten.cat"(%33287, %33288) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%33289, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %33290 = "torch.aten.mul.Tensor"(%33289, %33274) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%33290, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %33291 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %33292 = "torch.aten.add.Tensor"(%33275, %33290, %33291) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%33292, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %33293 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
    %33294 = "torch.constant.none"() : () -> !torch.none
    %33295 = "torch.constant.none"() : () -> !torch.none
    %33296 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %33297 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %33298 = "torch.aten.arange"(%33293, %33294, %33295, %33296, %33297) : (!torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[131072],si64>
    %33299 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %33300 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %33301 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %33302 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %33303 = "torch.constant.none"() : () -> !torch.none
    %33304 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %33305 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %33306 = "torch.aten.arange.start_step"(%33299, %33300, %33301, %33302, %33303, %33304, %33305) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[64],si64>
    %33307 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %33308 = "torch.prims.convert_element_type"(%33306, %33307) : (!torch.vtensor<[64],si64>, !torch.int) -> !torch.vtensor<[64],f32>
    %33309 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %33310 = "torch.aten.div.Scalar"(%33308, %33309) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %33311 = "torch.constant.float"() <{value = 5.000000e+05 : f64}> : () -> !torch.float
    %33312 = "torch.aten.pow.Scalar"(%33311, %33310) : (!torch.float, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %33313 = "torch.aten.reciprocal"(%33312) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %33314 = "torch.constant.float"() <{value = 1.000000e+00 : f64}> : () -> !torch.float
    %33315 = "torch.aten.mul.Scalar"(%33313, %33314) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
    %33316 = "torch.aten.reciprocal"(%33315) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %33317 = "torch.constant.float"() <{value = 6.2831853071795862 : f64}> : () -> !torch.float
    %33318 = "torch.aten.mul.Scalar"(%33316, %33317) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
    %33319 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
    %33320 = "torch.aten.gt.Scalar"(%33318, %33319) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %33321 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %33322 = "torch.aten.div.Scalar"(%33315, %33321) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %33323 = "torch.aten.where.self"(%33320, %33322, %33315) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %33324 = "torch.aten.reciprocal"(%33318) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %33325 = "torch.constant.int"() <{value = 8192 : i64}> : () -> !torch.int
    %33326 = "torch.aten.mul.Scalar"(%33324, %33325) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %33327 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %33328 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %33329 = "torch.aten.sub.Scalar"(%33326, %33327, %33328) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
    %33330 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %33331 = "torch.aten.div.Scalar"(%33329, %33330) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %33332 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %33333 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %33334 = "torch.aten.rsub.Scalar"(%33331, %33332, %33333) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
    %33335 = "torch.aten.mul.Tensor"(%33334, %33323) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %33336 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %33337 = "torch.aten.div.Scalar"(%33335, %33336) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %33338 = "torch.aten.mul.Tensor"(%33331, %33323) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %33339 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %33340 = "torch.aten.add.Tensor"(%33337, %33338, %33339) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %33341 = "torch.constant.float"() <{value = 2.048000e+03 : f64}> : () -> !torch.float
    %33342 = "torch.aten.lt.Scalar"(%33318, %33341) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %33343 = "torch.aten.bitwise_not"(%33342) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %33344 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
    %33345 = "torch.aten.gt.Scalar"(%33318, %33344) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %33346 = "torch.aten.bitwise_not"(%33345) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %33347 = "torch.aten.mul.Tensor"(%33343, %33346) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %33348 = "torch.aten.where.self"(%33347, %33340, %33323) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %33349 = "torch.prim.ListConstruct"(%33348, %33348) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.list<vtensor>
    %33350 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %33351 = "torch.aten.cat"(%33349, %33350) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[128],f32>
    %33352 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %33353 = "torch.prims.convert_element_type"(%33298, %33352) : (!torch.vtensor<[131072],si64>, !torch.int) -> !torch.vtensor<[131072],f32>
    %33354 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %33355 = "torch.prims.convert_element_type"(%33351, %33354) : (!torch.vtensor<[128],f32>, !torch.int) -> !torch.vtensor<[128],f32>
    %33356 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
    %33357 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %33358 = "torch.prim.ListConstruct"(%33356, %33357) : (!torch.int, !torch.int) -> !torch.list<int>
    %33359 = "torch.aten.view"(%33353, %33358) : (!torch.vtensor<[131072],f32>, !torch.list<int>) -> !torch.vtensor<[131072,1],f32>
    %33360 = "torch.aten.mul.Tensor"(%33359, %33355) : (!torch.vtensor<[131072,1],f32>, !torch.vtensor<[128],f32>) -> !torch.vtensor<[131072,128],f32>
    %33361 = "torch.aten.cos"(%33360) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
    %33362 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %33363 = "torch.prims.convert_element_type"(%33361, %33362) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
    %33364 = "torch.aten.sin"(%33360) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
    %33365 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %33366 = "torch.prims.convert_element_type"(%33364, %33365) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
    %33367 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %33368 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %33369 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %33370 = "torch.aten.slice.Tensor"(%33363, %33367, %33368, %18481, %33369) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%33370, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %33371 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %33372 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %33373 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %33374 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %33375 = "torch.aten.slice.Tensor"(%33370, %33371, %33372, %33373, %33374) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%33375, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %33376 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %33377 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %33378 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %33379 = "torch.aten.slice.Tensor"(%33366, %33376, %33377, %18481, %33378) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%33379, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %33380 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %33381 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %33382 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %33383 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %33384 = "torch.aten.slice.Tensor"(%33379, %33380, %33381, %33382, %33383) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%33384, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %33385 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %33386 = "torch.aten.unsqueeze"(%33375, %33385) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%33386, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %33387 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %33388 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %33389 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %33390 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %33391 = "torch.aten.slice.Tensor"(%33386, %33387, %33388, %33389, %33390) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%33391, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %33392 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %33393 = "torch.aten.unsqueeze"(%33391, %33392) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%33393, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %33394 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %33395 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %33396 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %33397 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %33398 = "torch.aten.slice.Tensor"(%33393, %33394, %33395, %33396, %33397) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%33398, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %33399 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %33400 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %33401 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %33402 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %33403 = "torch.prim.ListConstruct"(%33399, %33400, %33401, %33402) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %33404 = "torch.aten.repeat"(%33398, %33403) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%33404, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
    %33405 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %33406 = "torch.aten.unsqueeze"(%33384, %33405) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%33406, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %33407 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %33408 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %33409 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %33410 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %33411 = "torch.aten.slice.Tensor"(%33406, %33407, %33408, %33409, %33410) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%33411, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %33412 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %33413 = "torch.aten.unsqueeze"(%33411, %33412) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%33413, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %33414 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %33415 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %33416 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %33417 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %33418 = "torch.aten.slice.Tensor"(%33413, %33414, %33415, %33416, %33417) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%33418, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %33419 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %33420 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %33421 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %33422 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %33423 = "torch.prim.ListConstruct"(%33419, %33420, %33421, %33422) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %33424 = "torch.aten.repeat"(%33418, %33423) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%33424, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
    %33425 = "torch.aten.mul.Tensor"(%33137, %33404) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%33425, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %33426 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %33427 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %33428 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %33429 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %33430 = "torch.aten.slice.Tensor"(%33137, %33426, %33427, %33428, %33429) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%33430, %18474) <{shape_expressions = #map19}> : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.int) -> ()
    %33431 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %33432 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %33433 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %33434 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %33435 = "torch.aten.slice.Tensor"(%33137, %33431, %33432, %33433, %33434) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%33435, %18474) <{shape_expressions = #map19}> : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.int) -> ()
    %33436 = "torch.aten.neg"(%33435) : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>) -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%33436, %18474) <{shape_expressions = #map19}> : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.int) -> ()
    %33437 = "torch.prim.ListConstruct"(%33436, %33430) : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>) -> !torch.list<vtensor>
    %33438 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %33439 = "torch.aten.cat"(%33437, %33438) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%33439, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %33440 = "torch.aten.mul.Tensor"(%33439, %33424) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%33440, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %33441 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %33442 = "torch.aten.add.Tensor"(%33425, %33440, %33441) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%33442, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %33443 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %33444 = "torch.aten.mul.Scalar"(%arg69, %33443) : (!torch.vtensor<[4,?],si64>, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%33444, %18474) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %33445 = "torch.constant.int"() <{value = 44 : i64}> : () -> !torch.int
    %33446 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %33447 = "torch.aten.add.Scalar"(%33444, %33445, %33446) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%33447, %18474) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %33448 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %33449 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %33450 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %33451 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %33452 = "torch.prim.ListConstruct"(%33448, %18477, %33449, %33450, %33451) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %33453 = "torch.aten.view"(%33442, %33452) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%33453, %18474) <{shape_expressions = #map22}> : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %33454 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %33455 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %33456 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %33457 = "torch.prim.ListConstruct"(%19011, %33454, %33455, %33456) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %33458 = "torch.aten.view"(%33453, %33457) : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%33458, %18474) <{shape_expressions = #map23}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %33459 = "torch.prim.ListConstruct"(%19011) : (!torch.int) -> !torch.list<int>
    %33460 = "torch.aten.view"(%33447, %33459) : (!torch.vtensor<[4,?],si64>, !torch.list<int>) -> !torch.vtensor<[?],si64>
    "torch.bind_symbolic_shape"(%33460, %18474) <{shape_expressions = #map24}> : (!torch.vtensor<[?],si64>, !torch.int) -> ()
    %33461 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %33462 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %33463 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %33464 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %33465 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %33466 = "torch.prim.ListConstruct"(%18479, %33461, %33462, %33463, %33464, %33465) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %33467 = "torch.aten.view"(%32869, %33466) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%33467, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %33468 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %33469 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %33470 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %33471 = "torch.prim.ListConstruct"(%18993, %33468, %33469, %33470) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %33472 = "torch.aten.view"(%33467, %33471) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%33472, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %33473 = "torch.prim.ListConstruct"(%33460) : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>>
    %33474 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %33475 = "torch.aten.index_put"(%33472, %33473, %33458, %33474) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.bool) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%33475, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %33476 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %33477 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %33478 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %33479 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %33480 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %33481 = "torch.prim.ListConstruct"(%18479, %33476, %33477, %33478, %33479, %33480) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %33482 = "torch.aten.view"(%33475, %33481) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%33482, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %33483 = "torch.constant.int"() <{value = 2097152 : i64}> : () -> !torch.int
    %33484 = "torch.prim.ListConstruct"(%18479, %33483) : (!torch.int, !torch.int) -> !torch.list<int>
    %33485 = "torch.aten.view"(%33482, %33484) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,2097152],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%33485, %18475) <{shape_expressions = #map2}> : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> ()
    %33486 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %33487 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %33488 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %33489 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %33490 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %33491 = "torch.prim.ListConstruct"(%18479, %33486, %33487, %33488, %33489, %33490) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %33492 = "torch.aten.view"(%33485, %33491) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%33492, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %33493 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %33494 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %33495 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %33496 = "torch.prim.ListConstruct"(%18993, %33493, %33494, %33495) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %33497 = "torch.aten.view"(%33492, %33496) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%33497, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %33498 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %33499 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %33500 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %33501 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %33502 = "torch.prim.ListConstruct"(%33498, %18477, %33499, %33500, %33501) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %33503 = "torch.aten.view"(%33142, %33502) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%33503, %18474) <{shape_expressions = #map22}> : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %33504 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %33505 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %33506 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %33507 = "torch.prim.ListConstruct"(%19011, %33504, %33505, %33506) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %33508 = "torch.aten.view"(%33503, %33507) : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%33508, %18474) <{shape_expressions = #map23}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %33509 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %33510 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %33511 = "torch.aten.add.Scalar"(%33447, %33509, %33510) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%33511, %18474) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %33512 = "torch.prim.ListConstruct"(%19011) : (!torch.int) -> !torch.list<int>
    %33513 = "torch.aten.view"(%33511, %33512) : (!torch.vtensor<[4,?],si64>, !torch.list<int>) -> !torch.vtensor<[?],si64>
    "torch.bind_symbolic_shape"(%33513, %18474) <{shape_expressions = #map24}> : (!torch.vtensor<[?],si64>, !torch.int) -> ()
    %33514 = "torch.prim.ListConstruct"(%33513) : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>>
    %33515 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %33516 = "torch.aten.index_put"(%33497, %33514, %33508, %33515) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.bool) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%33516, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %33517 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %33518 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %33519 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %33520 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %33521 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %33522 = "torch.prim.ListConstruct"(%18479, %33517, %33518, %33519, %33520, %33521) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %33523 = "torch.aten.view"(%33516, %33522) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%33523, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %33524 = "torch.constant.int"() <{value = 2097152 : i64}> : () -> !torch.int
    %33525 = "torch.prim.ListConstruct"(%18479, %33524) : (!torch.int, !torch.int) -> !torch.list<int>
    %33526 = "torch.aten.view"(%33523, %33525) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,2097152],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%33526, %18475) <{shape_expressions = #map2}> : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> ()
    %33527 = "torch.constant.int"() <{value = -2 : i64}> : () -> !torch.int
    %33528 = "torch.aten.unsqueeze"(%33442, %33527) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%33528, %18474) <{shape_expressions = #map25}> : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.int) -> ()
    %33529 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %33530 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %33531 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %33532 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %33533 = "torch.prim.ListConstruct"(%33529, %18481, %33530, %33531, %33532) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %33534 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %33535 = "torch.aten.expand"(%33528, %33533, %33534) : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%33535, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %33536 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %33537 = "torch.aten.clone"(%33535, %33536) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%33537, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %33538 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %33539 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %33540 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %33541 = "torch.prim.ListConstruct"(%33538, %18481, %33539, %33540) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %33542 = "torch.aten._unsafe_view"(%33537, %33541) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%33542, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %33543 = "torch.constant.int"() <{value = -2 : i64}> : () -> !torch.int
    %33544 = "torch.aten.unsqueeze"(%33142, %33543) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%33544, %18474) <{shape_expressions = #map25}> : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.int) -> ()
    %33545 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %33546 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %33547 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %33548 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %33549 = "torch.prim.ListConstruct"(%33545, %18481, %33546, %33547, %33548) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %33550 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %33551 = "torch.aten.expand"(%33544, %33549, %33550) : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%33551, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %33552 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %33553 = "torch.aten.clone"(%33551, %33552) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%33553, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %33554 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %33555 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %33556 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %33557 = "torch.prim.ListConstruct"(%33554, %18481, %33555, %33556) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %33558 = "torch.aten._unsafe_view"(%33553, %33557) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%33558, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %33559 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %33560 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %33561 = "torch.aten.transpose.int"(%33292, %33559, %33560) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%33561, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %33562 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %33563 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %33564 = "torch.aten.transpose.int"(%33542, %33562, %33563) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%33564, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %33565 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %33566 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %33567 = "torch.aten.transpose.int"(%33558, %33565, %33566) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%33567, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %33568 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %33569 = "torch.aten.squeeze.dim"(%18570, %33568) : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,1,?,?],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%33569, %18474) <{shape_expressions = #map8}> : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> ()
    %33570 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %33571 = "torch.aten.squeeze.dim"(%33569, %33570) : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,1,?,?],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%33571, %18474) <{shape_expressions = #map8}> : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> ()
    %33572 = "torch_c.to_builtin_tensor"(%33561) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %33573 = "torch_c.to_builtin_tensor"(%33564) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %33574 = "torch_c.to_builtin_tensor"(%33567) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %33575 = "torch_c.to_builtin_tensor"(%33571) : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>) -> tensor<4x1x?x?xf8E4M3FNUZ>
    %33576 = "tensor.cast"(%33575) : (tensor<4x1x?x?xf8E4M3FNUZ>) -> tensor<?x?x?x?xf8E4M3FNUZ>
    %33577 = "torch_c.to_builtin_tensor"(%18089) : (!torch.vtensor<[],f32>) -> tensor<f32>
    %33578 = "util.call"(%33572, %33573, %33574, %33577, %33576) <{callee = @sharktank_masked_flash_attention_4_32_128_128_f8E4M3FNUZ_f32_f32}> : (tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<f32>, tensor<?x?x?x?xf8E4M3FNUZ>) -> tensor<4x32x?x128xf32>
    %33579 = "torch_c.from_builtin_tensor"(%33578) : (tensor<4x32x?x128xf32>) -> !torch.vtensor<[4,32,?,128],f32>
    "torch.bind_symbolic_shape"(%33579, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f32>, !torch.int) -> ()
    %33580 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %33581 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %33582 = "torch.aten.transpose.int"(%33579, %33580, %33581) : (!torch.vtensor<[4,32,?,128],f32>, !torch.int, !torch.int) -> !torch.vtensor<[4,?,32,128],f32>
    "torch.bind_symbolic_shape"(%33582, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f32>, !torch.int) -> ()
    %33583 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %33584 = "torch.aten.clone"(%33582, %33583) : (!torch.vtensor<[4,?,32,128],f32>, !torch.int) -> !torch.vtensor<[4,?,32,128],f32>
    "torch.bind_symbolic_shape"(%33584, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f32>, !torch.int) -> ()
    %33585 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %33586 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %33587 = "torch.prim.ListConstruct"(%33585, %18481, %33586) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %33588 = "torch.aten._unsafe_view"(%33584, %33587) : (!torch.vtensor<[4,?,32,128],f32>, !torch.list<int>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%33588, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %33589 = "torch.aten.div.Tensor"(%33588, %18091) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%33589, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %33590 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %33591 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %33592 = "torch.aten.clamp"(%33589, %33590, %33591) : (!torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%33592, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %33593 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %33594 = "torch.prims.convert_element_type"(%33592, %33593) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%33594, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %33595 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %33596 = "torch.aten.unsqueeze"(%18093, %33595) : (!torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>
    %33597 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %33598 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %33599 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %33600 = "torch.prim.ListConstruct"(%33597, %33598, %33599) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %33601 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %33602 = "torch.aten.expand"(%33596, %33600, %33601) : (!torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ>
    %33603 = "torch_c.to_builtin_tensor"(%33594) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %33604 = "torch_c.to_builtin_tensor"(%33602) : (!torch.vtensor<[4,4096,4096],f8E4M3FNUZ>) -> tensor<4x4096x4096xf8E4M3FNUZ>
    %33605 = "util.call"(%33603, %33604) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x?x4096xf32>
    %33606 = "torch_c.from_builtin_tensor"(%33605) : (tensor<4x?x4096xf32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%33606, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %33607 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %33608 = "torch.prims.convert_element_type"(%33606, %33607) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%33608, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %33609 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %33610 = "torch.aten.add.Tensor"(%33036, %33608, %33609) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%33610, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %33611 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %33612 = "torch.prims.convert_element_type"(%33610, %33611) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%33612, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %33613 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %33614 = "torch.aten.pow.Tensor_Scalar"(%33612, %33613) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%33614, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %33615 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %33616 = "torch.prim.ListConstruct"(%33615) : (!torch.int) -> !torch.list<int>
    %33617 = "torch.constant.bool"() <{value = true}> : () -> !torch.bool
    %33618 = "torch.constant.none"() : () -> !torch.none
    %33619 = "torch.aten.mean.dim"(%33614, %33616, %33617, %33618) : (!torch.vtensor<[4,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%33619, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %33620 = "torch.constant.float"() <{value = 1.000000e-05 : f64}> : () -> !torch.float
    %33621 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %33622 = "torch.aten.add.Scalar"(%33619, %33620, %33621) : (!torch.vtensor<[4,?,1],f32>, !torch.float, !torch.int) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%33622, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %33623 = "torch.aten.rsqrt"(%33622) : (!torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%33623, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %33624 = "torch.aten.mul.Tensor"(%33612, %33623) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%33624, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %33625 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %33626 = "torch.prims.convert_element_type"(%33624, %33625) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%33626, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %33627 = "torch.aten.mul.Tensor"(%18095, %33626) : (!torch.vtensor<[4096],bf16>, !torch.vtensor<[4,?,4096],bf16>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%33627, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %33628 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %33629 = "torch.prims.convert_element_type"(%33627, %33628) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%33629, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %33630 = "torch.aten.div.Tensor"(%33629, %18097) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%33630, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %33631 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %33632 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %33633 = "torch.aten.clamp"(%33630, %33631, %33632) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%33633, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %33634 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %33635 = "torch.prims.convert_element_type"(%33633, %33634) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%33635, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %33636 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %33637 = "torch.aten.unsqueeze"(%18099, %33636) : (!torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>
    %33638 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %33639 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %33640 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %33641 = "torch.prim.ListConstruct"(%33638, %33639, %33640) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %33642 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %33643 = "torch.aten.expand"(%33637, %33641, %33642) : (!torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ>
    %33644 = "torch_c.to_builtin_tensor"(%33635) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %33645 = "torch_c.to_builtin_tensor"(%33643) : (!torch.vtensor<[4,14336,4096],f8E4M3FNUZ>) -> tensor<4x14336x4096xf8E4M3FNUZ>
    %33646 = "util.call"(%33644, %33645) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x?x14336xf32>
    %33647 = "torch_c.from_builtin_tensor"(%33646) : (tensor<4x?x14336xf32>) -> !torch.vtensor<[4,?,14336],f32>
    "torch.bind_symbolic_shape"(%33647, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> ()
    %33648 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %33649 = "torch.prims.convert_element_type"(%33647, %33648) : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%33649, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %33650 = "torch.aten.silu"(%33649) : (!torch.vtensor<[4,?,14336],bf16>) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%33650, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %33651 = "torch.aten.div.Tensor"(%33629, %18101) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%33651, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %33652 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %33653 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %33654 = "torch.aten.clamp"(%33651, %33652, %33653) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%33654, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %33655 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %33656 = "torch.prims.convert_element_type"(%33654, %33655) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%33656, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %33657 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %33658 = "torch.aten.unsqueeze"(%18103, %33657) : (!torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>
    %33659 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %33660 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %33661 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %33662 = "torch.prim.ListConstruct"(%33659, %33660, %33661) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %33663 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %33664 = "torch.aten.expand"(%33658, %33662, %33663) : (!torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ>
    %33665 = "torch_c.to_builtin_tensor"(%33656) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %33666 = "torch_c.to_builtin_tensor"(%33664) : (!torch.vtensor<[4,14336,4096],f8E4M3FNUZ>) -> tensor<4x14336x4096xf8E4M3FNUZ>
    %33667 = "util.call"(%33665, %33666) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x?x14336xf32>
    %33668 = "torch_c.from_builtin_tensor"(%33667) : (tensor<4x?x14336xf32>) -> !torch.vtensor<[4,?,14336],f32>
    "torch.bind_symbolic_shape"(%33668, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> ()
    %33669 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %33670 = "torch.prims.convert_element_type"(%33668, %33669) : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%33670, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %33671 = "torch.aten.mul.Tensor"(%33650, %33670) : (!torch.vtensor<[4,?,14336],bf16>, !torch.vtensor<[4,?,14336],bf16>) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%33671, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %33672 = "torch.aten.div.Tensor"(%33671, %18105) : (!torch.vtensor<[4,?,14336],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%33672, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %33673 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %33674 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %33675 = "torch.aten.clamp"(%33672, %33673, %33674) : (!torch.vtensor<[4,?,14336],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%33675, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %33676 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %33677 = "torch.prims.convert_element_type"(%33675, %33676) : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> !torch.vtensor<[4,?,14336],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%33677, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],f8E4M3FNUZ>, !torch.int) -> ()
    %33678 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %33679 = "torch.aten.unsqueeze"(%18107, %33678) : (!torch.vtensor<[4096,14336],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,14336],f8E4M3FNUZ>
    %33680 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %33681 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %33682 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %33683 = "torch.prim.ListConstruct"(%33680, %33681, %33682) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %33684 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %33685 = "torch.aten.expand"(%33679, %33683, %33684) : (!torch.vtensor<[1,4096,14336],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,14336],f8E4M3FNUZ>
    %33686 = "torch_c.to_builtin_tensor"(%33677) : (!torch.vtensor<[4,?,14336],f8E4M3FNUZ>) -> tensor<4x?x14336xf8E4M3FNUZ>
    %33687 = "torch_c.to_builtin_tensor"(%33685) : (!torch.vtensor<[4,4096,14336],f8E4M3FNUZ>) -> tensor<4x4096x14336xf8E4M3FNUZ>
    %33688 = "util.call"(%33686, %33687) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx14336xf8E4M3FNUZ_R4x4096x14336xf8E4M3FNUZ}> : (tensor<4x?x14336xf8E4M3FNUZ>, tensor<4x4096x14336xf8E4M3FNUZ>) -> tensor<4x?x4096xf32>
    %33689 = "torch_c.from_builtin_tensor"(%33688) : (tensor<4x?x4096xf32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%33689, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %33690 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %33691 = "torch.prims.convert_element_type"(%33689, %33690) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%33691, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %33692 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %33693 = "torch.aten.add.Tensor"(%33610, %33691, %33692) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%33693, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %33694 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %33695 = "torch.prims.convert_element_type"(%33693, %33694) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%33695, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %33696 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %33697 = "torch.aten.pow.Tensor_Scalar"(%33695, %33696) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%33697, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %33698 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %33699 = "torch.prim.ListConstruct"(%33698) : (!torch.int) -> !torch.list<int>
    %33700 = "torch.constant.bool"() <{value = true}> : () -> !torch.bool
    %33701 = "torch.constant.none"() : () -> !torch.none
    %33702 = "torch.aten.mean.dim"(%33697, %33699, %33700, %33701) : (!torch.vtensor<[4,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%33702, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %33703 = "torch.constant.float"() <{value = 1.000000e-05 : f64}> : () -> !torch.float
    %33704 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %33705 = "torch.aten.add.Scalar"(%33702, %33703, %33704) : (!torch.vtensor<[4,?,1],f32>, !torch.float, !torch.int) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%33705, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %33706 = "torch.aten.rsqrt"(%33705) : (!torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%33706, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %33707 = "torch.aten.mul.Tensor"(%33695, %33706) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%33707, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %33708 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %33709 = "torch.prims.convert_element_type"(%33707, %33708) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%33709, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %33710 = "torch.aten.mul.Tensor"(%18109, %33709) : (!torch.vtensor<[4096],bf16>, !torch.vtensor<[4,?,4096],bf16>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%33710, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %33711 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %33712 = "torch.prims.convert_element_type"(%33710, %33711) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%33712, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %33713 = "torch.aten.div.Tensor"(%33712, %18111) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%33713, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %33714 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %33715 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %33716 = "torch.aten.clamp"(%33713, %33714, %33715) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%33716, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %33717 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %33718 = "torch.prims.convert_element_type"(%33716, %33717) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%33718, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %33719 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %33720 = "torch.aten.unsqueeze"(%18113, %33719) : (!torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>
    %33721 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %33722 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %33723 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %33724 = "torch.prim.ListConstruct"(%33721, %33722, %33723) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %33725 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %33726 = "torch.aten.expand"(%33720, %33724, %33725) : (!torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ>
    %33727 = "torch_c.to_builtin_tensor"(%33718) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %33728 = "torch_c.to_builtin_tensor"(%33726) : (!torch.vtensor<[4,4096,4096],f8E4M3FNUZ>) -> tensor<4x4096x4096xf8E4M3FNUZ>
    %33729 = "util.call"(%33727, %33728) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x?x4096xf32>
    %33730 = "torch_c.from_builtin_tensor"(%33729) : (tensor<4x?x4096xf32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%33730, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %33731 = "torch.aten.div.Tensor"(%33730, %18115) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%33731, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %33732 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %33733 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %33734 = "torch.aten.clamp"(%33731, %33732, %33733) : (!torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%33734, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %33735 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %33736 = "torch.prims.convert_element_type"(%33734, %33735) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%33736, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %33737 = "torch.aten.div.Tensor"(%33712, %18117) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%33737, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %33738 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %33739 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %33740 = "torch.aten.clamp"(%33737, %33738, %33739) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%33740, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %33741 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %33742 = "torch.prims.convert_element_type"(%33740, %33741) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%33742, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %33743 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %33744 = "torch.aten.unsqueeze"(%18119, %33743) : (!torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>
    %33745 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %33746 = "torch.constant.int"() <{value = 1024 : i64}> : () -> !torch.int
    %33747 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %33748 = "torch.prim.ListConstruct"(%33745, %33746, %33747) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %33749 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %33750 = "torch.aten.expand"(%33744, %33748, %33749) : (!torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ>
    %33751 = "torch_c.to_builtin_tensor"(%33742) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %33752 = "torch_c.to_builtin_tensor"(%33750) : (!torch.vtensor<[4,1024,4096],f8E4M3FNUZ>) -> tensor<4x1024x4096xf8E4M3FNUZ>
    %33753 = "util.call"(%33751, %33752) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x?x1024xf32>
    %33754 = "torch_c.from_builtin_tensor"(%33753) : (tensor<4x?x1024xf32>) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%33754, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %33755 = "torch.aten.div.Tensor"(%33754, %18121) : (!torch.vtensor<[4,?,1024],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%33755, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %33756 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %33757 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %33758 = "torch.aten.clamp"(%33755, %33756, %33757) : (!torch.vtensor<[4,?,1024],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%33758, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %33759 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %33760 = "torch.prims.convert_element_type"(%33758, %33759) : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> !torch.vtensor<[4,?,1024],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%33760, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.int) -> ()
    %33761 = "torch.aten.div.Tensor"(%33712, %18123) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%33761, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %33762 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %33763 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %33764 = "torch.aten.clamp"(%33761, %33762, %33763) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%33764, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %33765 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %33766 = "torch.prims.convert_element_type"(%33764, %33765) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%33766, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %33767 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %33768 = "torch.aten.unsqueeze"(%18125, %33767) : (!torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>
    %33769 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %33770 = "torch.constant.int"() <{value = 1024 : i64}> : () -> !torch.int
    %33771 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %33772 = "torch.prim.ListConstruct"(%33769, %33770, %33771) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %33773 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %33774 = "torch.aten.expand"(%33768, %33772, %33773) : (!torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ>
    %33775 = "torch_c.to_builtin_tensor"(%33766) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %33776 = "torch_c.to_builtin_tensor"(%33774) : (!torch.vtensor<[4,1024,4096],f8E4M3FNUZ>) -> tensor<4x1024x4096xf8E4M3FNUZ>
    %33777 = "util.call"(%33775, %33776) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x?x1024xf32>
    %33778 = "torch_c.from_builtin_tensor"(%33777) : (tensor<4x?x1024xf32>) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%33778, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %33779 = "torch.aten.div.Tensor"(%33778, %18127) : (!torch.vtensor<[4,?,1024],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%33779, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %33780 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %33781 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %33782 = "torch.aten.clamp"(%33779, %33780, %33781) : (!torch.vtensor<[4,?,1024],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%33782, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %33783 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %33784 = "torch.prims.convert_element_type"(%33782, %33783) : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> !torch.vtensor<[4,?,1024],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%33784, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.int) -> ()
    %33785 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %33786 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %33787 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %33788 = "torch.prim.ListConstruct"(%33785, %18481, %33786, %33787) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %33789 = "torch.aten.view"(%33736, %33788) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%33789, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %33790 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %33791 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %33792 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %33793 = "torch.prim.ListConstruct"(%33790, %18481, %33791, %33792) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %33794 = "torch.aten.view"(%33760, %33793) : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%33794, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %33795 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %33796 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %33797 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %33798 = "torch.prim.ListConstruct"(%33795, %18481, %33796, %33797) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %33799 = "torch.aten.view"(%33784, %33798) : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%33799, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %33800 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
    %33801 = "torch.constant.none"() : () -> !torch.none
    %33802 = "torch.constant.none"() : () -> !torch.none
    %33803 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %33804 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %33805 = "torch.aten.arange"(%33800, %33801, %33802, %33803, %33804) : (!torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[131072],si64>
    %33806 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %33807 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %33808 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %33809 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %33810 = "torch.constant.none"() : () -> !torch.none
    %33811 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %33812 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %33813 = "torch.aten.arange.start_step"(%33806, %33807, %33808, %33809, %33810, %33811, %33812) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[64],si64>
    %33814 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %33815 = "torch.prims.convert_element_type"(%33813, %33814) : (!torch.vtensor<[64],si64>, !torch.int) -> !torch.vtensor<[64],f32>
    %33816 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %33817 = "torch.aten.div.Scalar"(%33815, %33816) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %33818 = "torch.constant.float"() <{value = 5.000000e+05 : f64}> : () -> !torch.float
    %33819 = "torch.aten.pow.Scalar"(%33818, %33817) : (!torch.float, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %33820 = "torch.aten.reciprocal"(%33819) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %33821 = "torch.constant.float"() <{value = 1.000000e+00 : f64}> : () -> !torch.float
    %33822 = "torch.aten.mul.Scalar"(%33820, %33821) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
    %33823 = "torch.aten.reciprocal"(%33822) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %33824 = "torch.constant.float"() <{value = 6.2831853071795862 : f64}> : () -> !torch.float
    %33825 = "torch.aten.mul.Scalar"(%33823, %33824) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
    %33826 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
    %33827 = "torch.aten.gt.Scalar"(%33825, %33826) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %33828 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %33829 = "torch.aten.div.Scalar"(%33822, %33828) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %33830 = "torch.aten.where.self"(%33827, %33829, %33822) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %33831 = "torch.aten.reciprocal"(%33825) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %33832 = "torch.constant.int"() <{value = 8192 : i64}> : () -> !torch.int
    %33833 = "torch.aten.mul.Scalar"(%33831, %33832) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %33834 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %33835 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %33836 = "torch.aten.sub.Scalar"(%33833, %33834, %33835) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
    %33837 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %33838 = "torch.aten.div.Scalar"(%33836, %33837) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %33839 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %33840 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %33841 = "torch.aten.rsub.Scalar"(%33838, %33839, %33840) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
    %33842 = "torch.aten.mul.Tensor"(%33841, %33830) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %33843 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %33844 = "torch.aten.div.Scalar"(%33842, %33843) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %33845 = "torch.aten.mul.Tensor"(%33838, %33830) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %33846 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %33847 = "torch.aten.add.Tensor"(%33844, %33845, %33846) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %33848 = "torch.constant.float"() <{value = 2.048000e+03 : f64}> : () -> !torch.float
    %33849 = "torch.aten.lt.Scalar"(%33825, %33848) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %33850 = "torch.aten.bitwise_not"(%33849) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %33851 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
    %33852 = "torch.aten.gt.Scalar"(%33825, %33851) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %33853 = "torch.aten.bitwise_not"(%33852) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %33854 = "torch.aten.mul.Tensor"(%33850, %33853) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %33855 = "torch.aten.where.self"(%33854, %33847, %33830) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %33856 = "torch.prim.ListConstruct"(%33855, %33855) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.list<vtensor>
    %33857 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %33858 = "torch.aten.cat"(%33856, %33857) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[128],f32>
    %33859 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %33860 = "torch.prims.convert_element_type"(%33805, %33859) : (!torch.vtensor<[131072],si64>, !torch.int) -> !torch.vtensor<[131072],f32>
    %33861 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %33862 = "torch.prims.convert_element_type"(%33858, %33861) : (!torch.vtensor<[128],f32>, !torch.int) -> !torch.vtensor<[128],f32>
    %33863 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
    %33864 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %33865 = "torch.prim.ListConstruct"(%33863, %33864) : (!torch.int, !torch.int) -> !torch.list<int>
    %33866 = "torch.aten.view"(%33860, %33865) : (!torch.vtensor<[131072],f32>, !torch.list<int>) -> !torch.vtensor<[131072,1],f32>
    %33867 = "torch.aten.mul.Tensor"(%33866, %33862) : (!torch.vtensor<[131072,1],f32>, !torch.vtensor<[128],f32>) -> !torch.vtensor<[131072,128],f32>
    %33868 = "torch.aten.cos"(%33867) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
    %33869 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %33870 = "torch.prims.convert_element_type"(%33868, %33869) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
    %33871 = "torch.aten.sin"(%33867) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
    %33872 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %33873 = "torch.prims.convert_element_type"(%33871, %33872) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
    %33874 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %33875 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %33876 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %33877 = "torch.aten.slice.Tensor"(%33870, %33874, %33875, %18481, %33876) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%33877, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %33878 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %33879 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %33880 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %33881 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %33882 = "torch.aten.slice.Tensor"(%33877, %33878, %33879, %33880, %33881) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%33882, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %33883 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %33884 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %33885 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %33886 = "torch.aten.slice.Tensor"(%33873, %33883, %33884, %18481, %33885) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%33886, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %33887 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %33888 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %33889 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %33890 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %33891 = "torch.aten.slice.Tensor"(%33886, %33887, %33888, %33889, %33890) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%33891, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %33892 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %33893 = "torch.aten.unsqueeze"(%33882, %33892) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%33893, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %33894 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %33895 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %33896 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %33897 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %33898 = "torch.aten.slice.Tensor"(%33893, %33894, %33895, %33896, %33897) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%33898, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %33899 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %33900 = "torch.aten.unsqueeze"(%33898, %33899) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%33900, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %33901 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %33902 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %33903 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %33904 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %33905 = "torch.aten.slice.Tensor"(%33900, %33901, %33902, %33903, %33904) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%33905, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %33906 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %33907 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %33908 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %33909 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %33910 = "torch.prim.ListConstruct"(%33906, %33907, %33908, %33909) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %33911 = "torch.aten.repeat"(%33905, %33910) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%33911, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
    %33912 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %33913 = "torch.aten.unsqueeze"(%33891, %33912) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%33913, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %33914 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %33915 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %33916 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %33917 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %33918 = "torch.aten.slice.Tensor"(%33913, %33914, %33915, %33916, %33917) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%33918, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %33919 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %33920 = "torch.aten.unsqueeze"(%33918, %33919) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%33920, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %33921 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %33922 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %33923 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %33924 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %33925 = "torch.aten.slice.Tensor"(%33920, %33921, %33922, %33923, %33924) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%33925, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %33926 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %33927 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %33928 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %33929 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %33930 = "torch.prim.ListConstruct"(%33926, %33927, %33928, %33929) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %33931 = "torch.aten.repeat"(%33925, %33930) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%33931, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
    %33932 = "torch.aten.mul.Tensor"(%33789, %33911) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%33932, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %33933 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %33934 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %33935 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %33936 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %33937 = "torch.aten.slice.Tensor"(%33789, %33933, %33934, %33935, %33936) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%33937, %18474) <{shape_expressions = #map18}> : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.int) -> ()
    %33938 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %33939 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %33940 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %33941 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %33942 = "torch.aten.slice.Tensor"(%33789, %33938, %33939, %33940, %33941) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%33942, %18474) <{shape_expressions = #map18}> : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.int) -> ()
    %33943 = "torch.aten.neg"(%33942) : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>) -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%33943, %18474) <{shape_expressions = #map18}> : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.int) -> ()
    %33944 = "torch.prim.ListConstruct"(%33943, %33937) : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>) -> !torch.list<vtensor>
    %33945 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %33946 = "torch.aten.cat"(%33944, %33945) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%33946, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %33947 = "torch.aten.mul.Tensor"(%33946, %33931) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%33947, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %33948 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %33949 = "torch.aten.add.Tensor"(%33932, %33947, %33948) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%33949, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %33950 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
    %33951 = "torch.constant.none"() : () -> !torch.none
    %33952 = "torch.constant.none"() : () -> !torch.none
    %33953 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %33954 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %33955 = "torch.aten.arange"(%33950, %33951, %33952, %33953, %33954) : (!torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[131072],si64>
    %33956 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %33957 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %33958 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %33959 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %33960 = "torch.constant.none"() : () -> !torch.none
    %33961 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %33962 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %33963 = "torch.aten.arange.start_step"(%33956, %33957, %33958, %33959, %33960, %33961, %33962) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[64],si64>
    %33964 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %33965 = "torch.prims.convert_element_type"(%33963, %33964) : (!torch.vtensor<[64],si64>, !torch.int) -> !torch.vtensor<[64],f32>
    %33966 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %33967 = "torch.aten.div.Scalar"(%33965, %33966) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %33968 = "torch.constant.float"() <{value = 5.000000e+05 : f64}> : () -> !torch.float
    %33969 = "torch.aten.pow.Scalar"(%33968, %33967) : (!torch.float, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %33970 = "torch.aten.reciprocal"(%33969) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %33971 = "torch.constant.float"() <{value = 1.000000e+00 : f64}> : () -> !torch.float
    %33972 = "torch.aten.mul.Scalar"(%33970, %33971) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
    %33973 = "torch.aten.reciprocal"(%33972) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %33974 = "torch.constant.float"() <{value = 6.2831853071795862 : f64}> : () -> !torch.float
    %33975 = "torch.aten.mul.Scalar"(%33973, %33974) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
    %33976 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
    %33977 = "torch.aten.gt.Scalar"(%33975, %33976) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %33978 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %33979 = "torch.aten.div.Scalar"(%33972, %33978) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %33980 = "torch.aten.where.self"(%33977, %33979, %33972) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %33981 = "torch.aten.reciprocal"(%33975) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %33982 = "torch.constant.int"() <{value = 8192 : i64}> : () -> !torch.int
    %33983 = "torch.aten.mul.Scalar"(%33981, %33982) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %33984 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %33985 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %33986 = "torch.aten.sub.Scalar"(%33983, %33984, %33985) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
    %33987 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %33988 = "torch.aten.div.Scalar"(%33986, %33987) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %33989 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %33990 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %33991 = "torch.aten.rsub.Scalar"(%33988, %33989, %33990) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
    %33992 = "torch.aten.mul.Tensor"(%33991, %33980) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %33993 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %33994 = "torch.aten.div.Scalar"(%33992, %33993) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %33995 = "torch.aten.mul.Tensor"(%33988, %33980) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %33996 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %33997 = "torch.aten.add.Tensor"(%33994, %33995, %33996) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %33998 = "torch.constant.float"() <{value = 2.048000e+03 : f64}> : () -> !torch.float
    %33999 = "torch.aten.lt.Scalar"(%33975, %33998) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %34000 = "torch.aten.bitwise_not"(%33999) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %34001 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
    %34002 = "torch.aten.gt.Scalar"(%33975, %34001) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %34003 = "torch.aten.bitwise_not"(%34002) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %34004 = "torch.aten.mul.Tensor"(%34000, %34003) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %34005 = "torch.aten.where.self"(%34004, %33997, %33980) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %34006 = "torch.prim.ListConstruct"(%34005, %34005) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.list<vtensor>
    %34007 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %34008 = "torch.aten.cat"(%34006, %34007) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[128],f32>
    %34009 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %34010 = "torch.prims.convert_element_type"(%33955, %34009) : (!torch.vtensor<[131072],si64>, !torch.int) -> !torch.vtensor<[131072],f32>
    %34011 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %34012 = "torch.prims.convert_element_type"(%34008, %34011) : (!torch.vtensor<[128],f32>, !torch.int) -> !torch.vtensor<[128],f32>
    %34013 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
    %34014 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %34015 = "torch.prim.ListConstruct"(%34013, %34014) : (!torch.int, !torch.int) -> !torch.list<int>
    %34016 = "torch.aten.view"(%34010, %34015) : (!torch.vtensor<[131072],f32>, !torch.list<int>) -> !torch.vtensor<[131072,1],f32>
    %34017 = "torch.aten.mul.Tensor"(%34016, %34012) : (!torch.vtensor<[131072,1],f32>, !torch.vtensor<[128],f32>) -> !torch.vtensor<[131072,128],f32>
    %34018 = "torch.aten.cos"(%34017) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
    %34019 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %34020 = "torch.prims.convert_element_type"(%34018, %34019) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
    %34021 = "torch.aten.sin"(%34017) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
    %34022 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %34023 = "torch.prims.convert_element_type"(%34021, %34022) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
    %34024 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %34025 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %34026 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %34027 = "torch.aten.slice.Tensor"(%34020, %34024, %34025, %18481, %34026) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%34027, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %34028 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %34029 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %34030 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %34031 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %34032 = "torch.aten.slice.Tensor"(%34027, %34028, %34029, %34030, %34031) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%34032, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %34033 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %34034 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %34035 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %34036 = "torch.aten.slice.Tensor"(%34023, %34033, %34034, %18481, %34035) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%34036, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %34037 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %34038 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %34039 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %34040 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %34041 = "torch.aten.slice.Tensor"(%34036, %34037, %34038, %34039, %34040) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%34041, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %34042 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %34043 = "torch.aten.unsqueeze"(%34032, %34042) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%34043, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %34044 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %34045 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %34046 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %34047 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %34048 = "torch.aten.slice.Tensor"(%34043, %34044, %34045, %34046, %34047) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%34048, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %34049 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %34050 = "torch.aten.unsqueeze"(%34048, %34049) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%34050, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %34051 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %34052 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %34053 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %34054 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %34055 = "torch.aten.slice.Tensor"(%34050, %34051, %34052, %34053, %34054) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%34055, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %34056 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %34057 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %34058 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %34059 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %34060 = "torch.prim.ListConstruct"(%34056, %34057, %34058, %34059) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %34061 = "torch.aten.repeat"(%34055, %34060) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%34061, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
    %34062 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %34063 = "torch.aten.unsqueeze"(%34041, %34062) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%34063, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %34064 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %34065 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %34066 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %34067 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %34068 = "torch.aten.slice.Tensor"(%34063, %34064, %34065, %34066, %34067) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%34068, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %34069 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %34070 = "torch.aten.unsqueeze"(%34068, %34069) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%34070, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %34071 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %34072 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %34073 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %34074 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %34075 = "torch.aten.slice.Tensor"(%34070, %34071, %34072, %34073, %34074) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%34075, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %34076 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %34077 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %34078 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %34079 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %34080 = "torch.prim.ListConstruct"(%34076, %34077, %34078, %34079) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %34081 = "torch.aten.repeat"(%34075, %34080) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%34081, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
    %34082 = "torch.aten.mul.Tensor"(%33794, %34061) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%34082, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %34083 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %34084 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %34085 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %34086 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %34087 = "torch.aten.slice.Tensor"(%33794, %34083, %34084, %34085, %34086) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%34087, %18474) <{shape_expressions = #map19}> : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.int) -> ()
    %34088 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %34089 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %34090 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %34091 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %34092 = "torch.aten.slice.Tensor"(%33794, %34088, %34089, %34090, %34091) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%34092, %18474) <{shape_expressions = #map19}> : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.int) -> ()
    %34093 = "torch.aten.neg"(%34092) : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>) -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%34093, %18474) <{shape_expressions = #map19}> : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.int) -> ()
    %34094 = "torch.prim.ListConstruct"(%34093, %34087) : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>) -> !torch.list<vtensor>
    %34095 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %34096 = "torch.aten.cat"(%34094, %34095) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%34096, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %34097 = "torch.aten.mul.Tensor"(%34096, %34081) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%34097, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %34098 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %34099 = "torch.aten.add.Tensor"(%34082, %34097, %34098) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%34099, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %34100 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %34101 = "torch.aten.mul.Scalar"(%arg69, %34100) : (!torch.vtensor<[4,?],si64>, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%34101, %18474) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %34102 = "torch.constant.int"() <{value = 46 : i64}> : () -> !torch.int
    %34103 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %34104 = "torch.aten.add.Scalar"(%34101, %34102, %34103) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%34104, %18474) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %34105 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %34106 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %34107 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %34108 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %34109 = "torch.prim.ListConstruct"(%34105, %18477, %34106, %34107, %34108) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %34110 = "torch.aten.view"(%34099, %34109) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%34110, %18474) <{shape_expressions = #map22}> : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %34111 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %34112 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %34113 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %34114 = "torch.prim.ListConstruct"(%19011, %34111, %34112, %34113) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %34115 = "torch.aten.view"(%34110, %34114) : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%34115, %18474) <{shape_expressions = #map23}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %34116 = "torch.prim.ListConstruct"(%19011) : (!torch.int) -> !torch.list<int>
    %34117 = "torch.aten.view"(%34104, %34116) : (!torch.vtensor<[4,?],si64>, !torch.list<int>) -> !torch.vtensor<[?],si64>
    "torch.bind_symbolic_shape"(%34117, %18474) <{shape_expressions = #map24}> : (!torch.vtensor<[?],si64>, !torch.int) -> ()
    %34118 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %34119 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %34120 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %34121 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %34122 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %34123 = "torch.prim.ListConstruct"(%18479, %34118, %34119, %34120, %34121, %34122) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %34124 = "torch.aten.view"(%33526, %34123) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%34124, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %34125 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %34126 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %34127 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %34128 = "torch.prim.ListConstruct"(%18993, %34125, %34126, %34127) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %34129 = "torch.aten.view"(%34124, %34128) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%34129, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %34130 = "torch.prim.ListConstruct"(%34117) : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>>
    %34131 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %34132 = "torch.aten.index_put"(%34129, %34130, %34115, %34131) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.bool) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%34132, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %34133 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %34134 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %34135 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %34136 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %34137 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %34138 = "torch.prim.ListConstruct"(%18479, %34133, %34134, %34135, %34136, %34137) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %34139 = "torch.aten.view"(%34132, %34138) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%34139, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %34140 = "torch.constant.int"() <{value = 2097152 : i64}> : () -> !torch.int
    %34141 = "torch.prim.ListConstruct"(%18479, %34140) : (!torch.int, !torch.int) -> !torch.list<int>
    %34142 = "torch.aten.view"(%34139, %34141) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,2097152],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%34142, %18475) <{shape_expressions = #map2}> : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> ()
    %34143 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %34144 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %34145 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %34146 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %34147 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %34148 = "torch.prim.ListConstruct"(%18479, %34143, %34144, %34145, %34146, %34147) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %34149 = "torch.aten.view"(%34142, %34148) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%34149, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %34150 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %34151 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %34152 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %34153 = "torch.prim.ListConstruct"(%18993, %34150, %34151, %34152) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %34154 = "torch.aten.view"(%34149, %34153) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%34154, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %34155 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %34156 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %34157 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %34158 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %34159 = "torch.prim.ListConstruct"(%34155, %18477, %34156, %34157, %34158) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %34160 = "torch.aten.view"(%33799, %34159) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%34160, %18474) <{shape_expressions = #map22}> : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %34161 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %34162 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %34163 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %34164 = "torch.prim.ListConstruct"(%19011, %34161, %34162, %34163) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %34165 = "torch.aten.view"(%34160, %34164) : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%34165, %18474) <{shape_expressions = #map23}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %34166 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %34167 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %34168 = "torch.aten.add.Scalar"(%34104, %34166, %34167) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%34168, %18474) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %34169 = "torch.prim.ListConstruct"(%19011) : (!torch.int) -> !torch.list<int>
    %34170 = "torch.aten.view"(%34168, %34169) : (!torch.vtensor<[4,?],si64>, !torch.list<int>) -> !torch.vtensor<[?],si64>
    "torch.bind_symbolic_shape"(%34170, %18474) <{shape_expressions = #map24}> : (!torch.vtensor<[?],si64>, !torch.int) -> ()
    %34171 = "torch.prim.ListConstruct"(%34170) : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>>
    %34172 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %34173 = "torch.aten.index_put"(%34154, %34171, %34165, %34172) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.bool) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%34173, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %34174 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %34175 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %34176 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %34177 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %34178 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %34179 = "torch.prim.ListConstruct"(%18479, %34174, %34175, %34176, %34177, %34178) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %34180 = "torch.aten.view"(%34173, %34179) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%34180, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %34181 = "torch.constant.int"() <{value = 2097152 : i64}> : () -> !torch.int
    %34182 = "torch.prim.ListConstruct"(%18479, %34181) : (!torch.int, !torch.int) -> !torch.list<int>
    %34183 = "torch.aten.view"(%34180, %34182) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,2097152],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%34183, %18475) <{shape_expressions = #map2}> : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> ()
    %34184 = "torch.constant.int"() <{value = -2 : i64}> : () -> !torch.int
    %34185 = "torch.aten.unsqueeze"(%34099, %34184) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%34185, %18474) <{shape_expressions = #map25}> : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.int) -> ()
    %34186 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %34187 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %34188 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %34189 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %34190 = "torch.prim.ListConstruct"(%34186, %18481, %34187, %34188, %34189) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %34191 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %34192 = "torch.aten.expand"(%34185, %34190, %34191) : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%34192, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %34193 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %34194 = "torch.aten.clone"(%34192, %34193) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%34194, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %34195 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %34196 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %34197 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %34198 = "torch.prim.ListConstruct"(%34195, %18481, %34196, %34197) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %34199 = "torch.aten._unsafe_view"(%34194, %34198) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%34199, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %34200 = "torch.constant.int"() <{value = -2 : i64}> : () -> !torch.int
    %34201 = "torch.aten.unsqueeze"(%33799, %34200) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%34201, %18474) <{shape_expressions = #map25}> : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.int) -> ()
    %34202 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %34203 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %34204 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %34205 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %34206 = "torch.prim.ListConstruct"(%34202, %18481, %34203, %34204, %34205) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %34207 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %34208 = "torch.aten.expand"(%34201, %34206, %34207) : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%34208, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %34209 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %34210 = "torch.aten.clone"(%34208, %34209) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%34210, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %34211 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %34212 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %34213 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %34214 = "torch.prim.ListConstruct"(%34211, %18481, %34212, %34213) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %34215 = "torch.aten._unsafe_view"(%34210, %34214) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%34215, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %34216 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %34217 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %34218 = "torch.aten.transpose.int"(%33949, %34216, %34217) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%34218, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %34219 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %34220 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %34221 = "torch.aten.transpose.int"(%34199, %34219, %34220) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%34221, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %34222 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %34223 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %34224 = "torch.aten.transpose.int"(%34215, %34222, %34223) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%34224, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %34225 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %34226 = "torch.aten.squeeze.dim"(%18570, %34225) : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,1,?,?],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%34226, %18474) <{shape_expressions = #map8}> : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> ()
    %34227 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %34228 = "torch.aten.squeeze.dim"(%34226, %34227) : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,1,?,?],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%34228, %18474) <{shape_expressions = #map8}> : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> ()
    %34229 = "torch_c.to_builtin_tensor"(%34218) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %34230 = "torch_c.to_builtin_tensor"(%34221) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %34231 = "torch_c.to_builtin_tensor"(%34224) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %34232 = "torch_c.to_builtin_tensor"(%34228) : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>) -> tensor<4x1x?x?xf8E4M3FNUZ>
    %34233 = "tensor.cast"(%34232) : (tensor<4x1x?x?xf8E4M3FNUZ>) -> tensor<?x?x?x?xf8E4M3FNUZ>
    %34234 = "torch_c.to_builtin_tensor"(%18129) : (!torch.vtensor<[],f32>) -> tensor<f32>
    %34235 = "util.call"(%34229, %34230, %34231, %34234, %34233) <{callee = @sharktank_masked_flash_attention_4_32_128_128_f8E4M3FNUZ_f32_f32}> : (tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<f32>, tensor<?x?x?x?xf8E4M3FNUZ>) -> tensor<4x32x?x128xf32>
    %34236 = "torch_c.from_builtin_tensor"(%34235) : (tensor<4x32x?x128xf32>) -> !torch.vtensor<[4,32,?,128],f32>
    "torch.bind_symbolic_shape"(%34236, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f32>, !torch.int) -> ()
    %34237 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %34238 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %34239 = "torch.aten.transpose.int"(%34236, %34237, %34238) : (!torch.vtensor<[4,32,?,128],f32>, !torch.int, !torch.int) -> !torch.vtensor<[4,?,32,128],f32>
    "torch.bind_symbolic_shape"(%34239, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f32>, !torch.int) -> ()
    %34240 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %34241 = "torch.aten.clone"(%34239, %34240) : (!torch.vtensor<[4,?,32,128],f32>, !torch.int) -> !torch.vtensor<[4,?,32,128],f32>
    "torch.bind_symbolic_shape"(%34241, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f32>, !torch.int) -> ()
    %34242 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %34243 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %34244 = "torch.prim.ListConstruct"(%34242, %18481, %34243) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %34245 = "torch.aten._unsafe_view"(%34241, %34244) : (!torch.vtensor<[4,?,32,128],f32>, !torch.list<int>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%34245, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %34246 = "torch.aten.div.Tensor"(%34245, %18131) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%34246, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %34247 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %34248 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %34249 = "torch.aten.clamp"(%34246, %34247, %34248) : (!torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%34249, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %34250 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %34251 = "torch.prims.convert_element_type"(%34249, %34250) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%34251, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %34252 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %34253 = "torch.aten.unsqueeze"(%18133, %34252) : (!torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>
    %34254 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %34255 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %34256 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %34257 = "torch.prim.ListConstruct"(%34254, %34255, %34256) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %34258 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %34259 = "torch.aten.expand"(%34253, %34257, %34258) : (!torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ>
    %34260 = "torch_c.to_builtin_tensor"(%34251) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %34261 = "torch_c.to_builtin_tensor"(%34259) : (!torch.vtensor<[4,4096,4096],f8E4M3FNUZ>) -> tensor<4x4096x4096xf8E4M3FNUZ>
    %34262 = "util.call"(%34260, %34261) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x?x4096xf32>
    %34263 = "torch_c.from_builtin_tensor"(%34262) : (tensor<4x?x4096xf32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%34263, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %34264 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %34265 = "torch.prims.convert_element_type"(%34263, %34264) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%34265, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %34266 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %34267 = "torch.aten.add.Tensor"(%33693, %34265, %34266) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%34267, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %34268 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %34269 = "torch.prims.convert_element_type"(%34267, %34268) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%34269, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %34270 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %34271 = "torch.aten.pow.Tensor_Scalar"(%34269, %34270) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%34271, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %34272 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %34273 = "torch.prim.ListConstruct"(%34272) : (!torch.int) -> !torch.list<int>
    %34274 = "torch.constant.bool"() <{value = true}> : () -> !torch.bool
    %34275 = "torch.constant.none"() : () -> !torch.none
    %34276 = "torch.aten.mean.dim"(%34271, %34273, %34274, %34275) : (!torch.vtensor<[4,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%34276, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %34277 = "torch.constant.float"() <{value = 1.000000e-05 : f64}> : () -> !torch.float
    %34278 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %34279 = "torch.aten.add.Scalar"(%34276, %34277, %34278) : (!torch.vtensor<[4,?,1],f32>, !torch.float, !torch.int) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%34279, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %34280 = "torch.aten.rsqrt"(%34279) : (!torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%34280, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %34281 = "torch.aten.mul.Tensor"(%34269, %34280) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%34281, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %34282 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %34283 = "torch.prims.convert_element_type"(%34281, %34282) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%34283, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %34284 = "torch.aten.mul.Tensor"(%18135, %34283) : (!torch.vtensor<[4096],bf16>, !torch.vtensor<[4,?,4096],bf16>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%34284, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %34285 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %34286 = "torch.prims.convert_element_type"(%34284, %34285) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%34286, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %34287 = "torch.aten.div.Tensor"(%34286, %18137) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%34287, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %34288 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %34289 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %34290 = "torch.aten.clamp"(%34287, %34288, %34289) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%34290, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %34291 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %34292 = "torch.prims.convert_element_type"(%34290, %34291) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%34292, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %34293 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %34294 = "torch.aten.unsqueeze"(%18139, %34293) : (!torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>
    %34295 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %34296 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %34297 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %34298 = "torch.prim.ListConstruct"(%34295, %34296, %34297) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %34299 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %34300 = "torch.aten.expand"(%34294, %34298, %34299) : (!torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ>
    %34301 = "torch_c.to_builtin_tensor"(%34292) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %34302 = "torch_c.to_builtin_tensor"(%34300) : (!torch.vtensor<[4,14336,4096],f8E4M3FNUZ>) -> tensor<4x14336x4096xf8E4M3FNUZ>
    %34303 = "util.call"(%34301, %34302) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x?x14336xf32>
    %34304 = "torch_c.from_builtin_tensor"(%34303) : (tensor<4x?x14336xf32>) -> !torch.vtensor<[4,?,14336],f32>
    "torch.bind_symbolic_shape"(%34304, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> ()
    %34305 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %34306 = "torch.prims.convert_element_type"(%34304, %34305) : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%34306, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %34307 = "torch.aten.silu"(%34306) : (!torch.vtensor<[4,?,14336],bf16>) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%34307, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %34308 = "torch.aten.div.Tensor"(%34286, %18141) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%34308, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %34309 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %34310 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %34311 = "torch.aten.clamp"(%34308, %34309, %34310) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%34311, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %34312 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %34313 = "torch.prims.convert_element_type"(%34311, %34312) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%34313, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %34314 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %34315 = "torch.aten.unsqueeze"(%18143, %34314) : (!torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>
    %34316 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %34317 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %34318 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %34319 = "torch.prim.ListConstruct"(%34316, %34317, %34318) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %34320 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %34321 = "torch.aten.expand"(%34315, %34319, %34320) : (!torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ>
    %34322 = "torch_c.to_builtin_tensor"(%34313) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %34323 = "torch_c.to_builtin_tensor"(%34321) : (!torch.vtensor<[4,14336,4096],f8E4M3FNUZ>) -> tensor<4x14336x4096xf8E4M3FNUZ>
    %34324 = "util.call"(%34322, %34323) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x?x14336xf32>
    %34325 = "torch_c.from_builtin_tensor"(%34324) : (tensor<4x?x14336xf32>) -> !torch.vtensor<[4,?,14336],f32>
    "torch.bind_symbolic_shape"(%34325, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> ()
    %34326 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %34327 = "torch.prims.convert_element_type"(%34325, %34326) : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%34327, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %34328 = "torch.aten.mul.Tensor"(%34307, %34327) : (!torch.vtensor<[4,?,14336],bf16>, !torch.vtensor<[4,?,14336],bf16>) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%34328, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %34329 = "torch.aten.div.Tensor"(%34328, %18145) : (!torch.vtensor<[4,?,14336],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%34329, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %34330 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %34331 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %34332 = "torch.aten.clamp"(%34329, %34330, %34331) : (!torch.vtensor<[4,?,14336],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%34332, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %34333 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %34334 = "torch.prims.convert_element_type"(%34332, %34333) : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> !torch.vtensor<[4,?,14336],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%34334, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],f8E4M3FNUZ>, !torch.int) -> ()
    %34335 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %34336 = "torch.aten.unsqueeze"(%18147, %34335) : (!torch.vtensor<[4096,14336],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,14336],f8E4M3FNUZ>
    %34337 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %34338 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %34339 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %34340 = "torch.prim.ListConstruct"(%34337, %34338, %34339) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %34341 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %34342 = "torch.aten.expand"(%34336, %34340, %34341) : (!torch.vtensor<[1,4096,14336],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,14336],f8E4M3FNUZ>
    %34343 = "torch_c.to_builtin_tensor"(%34334) : (!torch.vtensor<[4,?,14336],f8E4M3FNUZ>) -> tensor<4x?x14336xf8E4M3FNUZ>
    %34344 = "torch_c.to_builtin_tensor"(%34342) : (!torch.vtensor<[4,4096,14336],f8E4M3FNUZ>) -> tensor<4x4096x14336xf8E4M3FNUZ>
    %34345 = "util.call"(%34343, %34344) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx14336xf8E4M3FNUZ_R4x4096x14336xf8E4M3FNUZ}> : (tensor<4x?x14336xf8E4M3FNUZ>, tensor<4x4096x14336xf8E4M3FNUZ>) -> tensor<4x?x4096xf32>
    %34346 = "torch_c.from_builtin_tensor"(%34345) : (tensor<4x?x4096xf32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%34346, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %34347 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %34348 = "torch.prims.convert_element_type"(%34346, %34347) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%34348, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %34349 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %34350 = "torch.aten.add.Tensor"(%34267, %34348, %34349) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%34350, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %34351 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %34352 = "torch.prims.convert_element_type"(%34350, %34351) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%34352, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %34353 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %34354 = "torch.aten.pow.Tensor_Scalar"(%34352, %34353) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%34354, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %34355 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %34356 = "torch.prim.ListConstruct"(%34355) : (!torch.int) -> !torch.list<int>
    %34357 = "torch.constant.bool"() <{value = true}> : () -> !torch.bool
    %34358 = "torch.constant.none"() : () -> !torch.none
    %34359 = "torch.aten.mean.dim"(%34354, %34356, %34357, %34358) : (!torch.vtensor<[4,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%34359, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %34360 = "torch.constant.float"() <{value = 1.000000e-05 : f64}> : () -> !torch.float
    %34361 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %34362 = "torch.aten.add.Scalar"(%34359, %34360, %34361) : (!torch.vtensor<[4,?,1],f32>, !torch.float, !torch.int) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%34362, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %34363 = "torch.aten.rsqrt"(%34362) : (!torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%34363, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %34364 = "torch.aten.mul.Tensor"(%34352, %34363) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%34364, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %34365 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %34366 = "torch.prims.convert_element_type"(%34364, %34365) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%34366, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %34367 = "torch.aten.mul.Tensor"(%18149, %34366) : (!torch.vtensor<[4096],bf16>, !torch.vtensor<[4,?,4096],bf16>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%34367, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %34368 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %34369 = "torch.prims.convert_element_type"(%34367, %34368) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%34369, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %34370 = "torch.aten.div.Tensor"(%34369, %18151) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%34370, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %34371 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %34372 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %34373 = "torch.aten.clamp"(%34370, %34371, %34372) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%34373, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %34374 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %34375 = "torch.prims.convert_element_type"(%34373, %34374) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%34375, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %34376 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %34377 = "torch.aten.unsqueeze"(%18153, %34376) : (!torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>
    %34378 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %34379 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %34380 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %34381 = "torch.prim.ListConstruct"(%34378, %34379, %34380) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %34382 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %34383 = "torch.aten.expand"(%34377, %34381, %34382) : (!torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ>
    %34384 = "torch_c.to_builtin_tensor"(%34375) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %34385 = "torch_c.to_builtin_tensor"(%34383) : (!torch.vtensor<[4,4096,4096],f8E4M3FNUZ>) -> tensor<4x4096x4096xf8E4M3FNUZ>
    %34386 = "util.call"(%34384, %34385) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x?x4096xf32>
    %34387 = "torch_c.from_builtin_tensor"(%34386) : (tensor<4x?x4096xf32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%34387, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %34388 = "torch.aten.div.Tensor"(%34387, %18155) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%34388, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %34389 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %34390 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %34391 = "torch.aten.clamp"(%34388, %34389, %34390) : (!torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%34391, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %34392 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %34393 = "torch.prims.convert_element_type"(%34391, %34392) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%34393, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %34394 = "torch.aten.div.Tensor"(%34369, %18157) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%34394, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %34395 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %34396 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %34397 = "torch.aten.clamp"(%34394, %34395, %34396) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%34397, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %34398 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %34399 = "torch.prims.convert_element_type"(%34397, %34398) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%34399, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %34400 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %34401 = "torch.aten.unsqueeze"(%18159, %34400) : (!torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>
    %34402 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %34403 = "torch.constant.int"() <{value = 1024 : i64}> : () -> !torch.int
    %34404 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %34405 = "torch.prim.ListConstruct"(%34402, %34403, %34404) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %34406 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %34407 = "torch.aten.expand"(%34401, %34405, %34406) : (!torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ>
    %34408 = "torch_c.to_builtin_tensor"(%34399) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %34409 = "torch_c.to_builtin_tensor"(%34407) : (!torch.vtensor<[4,1024,4096],f8E4M3FNUZ>) -> tensor<4x1024x4096xf8E4M3FNUZ>
    %34410 = "util.call"(%34408, %34409) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x?x1024xf32>
    %34411 = "torch_c.from_builtin_tensor"(%34410) : (tensor<4x?x1024xf32>) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%34411, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %34412 = "torch.aten.div.Tensor"(%34411, %18161) : (!torch.vtensor<[4,?,1024],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%34412, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %34413 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %34414 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %34415 = "torch.aten.clamp"(%34412, %34413, %34414) : (!torch.vtensor<[4,?,1024],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%34415, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %34416 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %34417 = "torch.prims.convert_element_type"(%34415, %34416) : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> !torch.vtensor<[4,?,1024],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%34417, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.int) -> ()
    %34418 = "torch.aten.div.Tensor"(%34369, %18163) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%34418, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %34419 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %34420 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %34421 = "torch.aten.clamp"(%34418, %34419, %34420) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%34421, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %34422 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %34423 = "torch.prims.convert_element_type"(%34421, %34422) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%34423, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %34424 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %34425 = "torch.aten.unsqueeze"(%18165, %34424) : (!torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>
    %34426 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %34427 = "torch.constant.int"() <{value = 1024 : i64}> : () -> !torch.int
    %34428 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %34429 = "torch.prim.ListConstruct"(%34426, %34427, %34428) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %34430 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %34431 = "torch.aten.expand"(%34425, %34429, %34430) : (!torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ>
    %34432 = "torch_c.to_builtin_tensor"(%34423) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %34433 = "torch_c.to_builtin_tensor"(%34431) : (!torch.vtensor<[4,1024,4096],f8E4M3FNUZ>) -> tensor<4x1024x4096xf8E4M3FNUZ>
    %34434 = "util.call"(%34432, %34433) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x?x1024xf32>
    %34435 = "torch_c.from_builtin_tensor"(%34434) : (tensor<4x?x1024xf32>) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%34435, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %34436 = "torch.aten.div.Tensor"(%34435, %18167) : (!torch.vtensor<[4,?,1024],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%34436, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %34437 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %34438 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %34439 = "torch.aten.clamp"(%34436, %34437, %34438) : (!torch.vtensor<[4,?,1024],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%34439, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %34440 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %34441 = "torch.prims.convert_element_type"(%34439, %34440) : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> !torch.vtensor<[4,?,1024],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%34441, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.int) -> ()
    %34442 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %34443 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %34444 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %34445 = "torch.prim.ListConstruct"(%34442, %18481, %34443, %34444) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %34446 = "torch.aten.view"(%34393, %34445) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%34446, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %34447 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %34448 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %34449 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %34450 = "torch.prim.ListConstruct"(%34447, %18481, %34448, %34449) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %34451 = "torch.aten.view"(%34417, %34450) : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%34451, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %34452 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %34453 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %34454 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %34455 = "torch.prim.ListConstruct"(%34452, %18481, %34453, %34454) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %34456 = "torch.aten.view"(%34441, %34455) : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%34456, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %34457 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
    %34458 = "torch.constant.none"() : () -> !torch.none
    %34459 = "torch.constant.none"() : () -> !torch.none
    %34460 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %34461 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %34462 = "torch.aten.arange"(%34457, %34458, %34459, %34460, %34461) : (!torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[131072],si64>
    %34463 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %34464 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %34465 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %34466 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %34467 = "torch.constant.none"() : () -> !torch.none
    %34468 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %34469 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %34470 = "torch.aten.arange.start_step"(%34463, %34464, %34465, %34466, %34467, %34468, %34469) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[64],si64>
    %34471 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %34472 = "torch.prims.convert_element_type"(%34470, %34471) : (!torch.vtensor<[64],si64>, !torch.int) -> !torch.vtensor<[64],f32>
    %34473 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %34474 = "torch.aten.div.Scalar"(%34472, %34473) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %34475 = "torch.constant.float"() <{value = 5.000000e+05 : f64}> : () -> !torch.float
    %34476 = "torch.aten.pow.Scalar"(%34475, %34474) : (!torch.float, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %34477 = "torch.aten.reciprocal"(%34476) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %34478 = "torch.constant.float"() <{value = 1.000000e+00 : f64}> : () -> !torch.float
    %34479 = "torch.aten.mul.Scalar"(%34477, %34478) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
    %34480 = "torch.aten.reciprocal"(%34479) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %34481 = "torch.constant.float"() <{value = 6.2831853071795862 : f64}> : () -> !torch.float
    %34482 = "torch.aten.mul.Scalar"(%34480, %34481) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
    %34483 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
    %34484 = "torch.aten.gt.Scalar"(%34482, %34483) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %34485 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %34486 = "torch.aten.div.Scalar"(%34479, %34485) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %34487 = "torch.aten.where.self"(%34484, %34486, %34479) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %34488 = "torch.aten.reciprocal"(%34482) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %34489 = "torch.constant.int"() <{value = 8192 : i64}> : () -> !torch.int
    %34490 = "torch.aten.mul.Scalar"(%34488, %34489) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %34491 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %34492 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %34493 = "torch.aten.sub.Scalar"(%34490, %34491, %34492) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
    %34494 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %34495 = "torch.aten.div.Scalar"(%34493, %34494) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %34496 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %34497 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %34498 = "torch.aten.rsub.Scalar"(%34495, %34496, %34497) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
    %34499 = "torch.aten.mul.Tensor"(%34498, %34487) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %34500 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %34501 = "torch.aten.div.Scalar"(%34499, %34500) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %34502 = "torch.aten.mul.Tensor"(%34495, %34487) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %34503 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %34504 = "torch.aten.add.Tensor"(%34501, %34502, %34503) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %34505 = "torch.constant.float"() <{value = 2.048000e+03 : f64}> : () -> !torch.float
    %34506 = "torch.aten.lt.Scalar"(%34482, %34505) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %34507 = "torch.aten.bitwise_not"(%34506) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %34508 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
    %34509 = "torch.aten.gt.Scalar"(%34482, %34508) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %34510 = "torch.aten.bitwise_not"(%34509) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %34511 = "torch.aten.mul.Tensor"(%34507, %34510) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %34512 = "torch.aten.where.self"(%34511, %34504, %34487) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %34513 = "torch.prim.ListConstruct"(%34512, %34512) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.list<vtensor>
    %34514 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %34515 = "torch.aten.cat"(%34513, %34514) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[128],f32>
    %34516 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %34517 = "torch.prims.convert_element_type"(%34462, %34516) : (!torch.vtensor<[131072],si64>, !torch.int) -> !torch.vtensor<[131072],f32>
    %34518 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %34519 = "torch.prims.convert_element_type"(%34515, %34518) : (!torch.vtensor<[128],f32>, !torch.int) -> !torch.vtensor<[128],f32>
    %34520 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
    %34521 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %34522 = "torch.prim.ListConstruct"(%34520, %34521) : (!torch.int, !torch.int) -> !torch.list<int>
    %34523 = "torch.aten.view"(%34517, %34522) : (!torch.vtensor<[131072],f32>, !torch.list<int>) -> !torch.vtensor<[131072,1],f32>
    %34524 = "torch.aten.mul.Tensor"(%34523, %34519) : (!torch.vtensor<[131072,1],f32>, !torch.vtensor<[128],f32>) -> !torch.vtensor<[131072,128],f32>
    %34525 = "torch.aten.cos"(%34524) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
    %34526 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %34527 = "torch.prims.convert_element_type"(%34525, %34526) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
    %34528 = "torch.aten.sin"(%34524) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
    %34529 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %34530 = "torch.prims.convert_element_type"(%34528, %34529) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
    %34531 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %34532 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %34533 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %34534 = "torch.aten.slice.Tensor"(%34527, %34531, %34532, %18481, %34533) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%34534, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %34535 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %34536 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %34537 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %34538 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %34539 = "torch.aten.slice.Tensor"(%34534, %34535, %34536, %34537, %34538) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%34539, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %34540 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %34541 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %34542 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %34543 = "torch.aten.slice.Tensor"(%34530, %34540, %34541, %18481, %34542) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%34543, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %34544 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %34545 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %34546 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %34547 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %34548 = "torch.aten.slice.Tensor"(%34543, %34544, %34545, %34546, %34547) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%34548, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %34549 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %34550 = "torch.aten.unsqueeze"(%34539, %34549) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%34550, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %34551 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %34552 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %34553 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %34554 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %34555 = "torch.aten.slice.Tensor"(%34550, %34551, %34552, %34553, %34554) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%34555, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %34556 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %34557 = "torch.aten.unsqueeze"(%34555, %34556) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%34557, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %34558 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %34559 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %34560 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %34561 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %34562 = "torch.aten.slice.Tensor"(%34557, %34558, %34559, %34560, %34561) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%34562, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %34563 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %34564 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %34565 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %34566 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %34567 = "torch.prim.ListConstruct"(%34563, %34564, %34565, %34566) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %34568 = "torch.aten.repeat"(%34562, %34567) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%34568, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
    %34569 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %34570 = "torch.aten.unsqueeze"(%34548, %34569) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%34570, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %34571 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %34572 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %34573 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %34574 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %34575 = "torch.aten.slice.Tensor"(%34570, %34571, %34572, %34573, %34574) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%34575, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %34576 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %34577 = "torch.aten.unsqueeze"(%34575, %34576) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%34577, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %34578 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %34579 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %34580 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %34581 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %34582 = "torch.aten.slice.Tensor"(%34577, %34578, %34579, %34580, %34581) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%34582, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %34583 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %34584 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %34585 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %34586 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %34587 = "torch.prim.ListConstruct"(%34583, %34584, %34585, %34586) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %34588 = "torch.aten.repeat"(%34582, %34587) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%34588, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
    %34589 = "torch.aten.mul.Tensor"(%34446, %34568) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%34589, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %34590 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %34591 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %34592 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %34593 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %34594 = "torch.aten.slice.Tensor"(%34446, %34590, %34591, %34592, %34593) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%34594, %18474) <{shape_expressions = #map18}> : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.int) -> ()
    %34595 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %34596 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %34597 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %34598 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %34599 = "torch.aten.slice.Tensor"(%34446, %34595, %34596, %34597, %34598) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%34599, %18474) <{shape_expressions = #map18}> : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.int) -> ()
    %34600 = "torch.aten.neg"(%34599) : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>) -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%34600, %18474) <{shape_expressions = #map18}> : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.int) -> ()
    %34601 = "torch.prim.ListConstruct"(%34600, %34594) : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>) -> !torch.list<vtensor>
    %34602 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %34603 = "torch.aten.cat"(%34601, %34602) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%34603, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %34604 = "torch.aten.mul.Tensor"(%34603, %34588) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%34604, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %34605 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %34606 = "torch.aten.add.Tensor"(%34589, %34604, %34605) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%34606, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %34607 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
    %34608 = "torch.constant.none"() : () -> !torch.none
    %34609 = "torch.constant.none"() : () -> !torch.none
    %34610 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %34611 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %34612 = "torch.aten.arange"(%34607, %34608, %34609, %34610, %34611) : (!torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[131072],si64>
    %34613 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %34614 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %34615 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %34616 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %34617 = "torch.constant.none"() : () -> !torch.none
    %34618 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %34619 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %34620 = "torch.aten.arange.start_step"(%34613, %34614, %34615, %34616, %34617, %34618, %34619) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[64],si64>
    %34621 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %34622 = "torch.prims.convert_element_type"(%34620, %34621) : (!torch.vtensor<[64],si64>, !torch.int) -> !torch.vtensor<[64],f32>
    %34623 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %34624 = "torch.aten.div.Scalar"(%34622, %34623) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %34625 = "torch.constant.float"() <{value = 5.000000e+05 : f64}> : () -> !torch.float
    %34626 = "torch.aten.pow.Scalar"(%34625, %34624) : (!torch.float, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %34627 = "torch.aten.reciprocal"(%34626) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %34628 = "torch.constant.float"() <{value = 1.000000e+00 : f64}> : () -> !torch.float
    %34629 = "torch.aten.mul.Scalar"(%34627, %34628) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
    %34630 = "torch.aten.reciprocal"(%34629) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %34631 = "torch.constant.float"() <{value = 6.2831853071795862 : f64}> : () -> !torch.float
    %34632 = "torch.aten.mul.Scalar"(%34630, %34631) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
    %34633 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
    %34634 = "torch.aten.gt.Scalar"(%34632, %34633) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %34635 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %34636 = "torch.aten.div.Scalar"(%34629, %34635) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %34637 = "torch.aten.where.self"(%34634, %34636, %34629) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %34638 = "torch.aten.reciprocal"(%34632) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %34639 = "torch.constant.int"() <{value = 8192 : i64}> : () -> !torch.int
    %34640 = "torch.aten.mul.Scalar"(%34638, %34639) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %34641 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %34642 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %34643 = "torch.aten.sub.Scalar"(%34640, %34641, %34642) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
    %34644 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %34645 = "torch.aten.div.Scalar"(%34643, %34644) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %34646 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %34647 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %34648 = "torch.aten.rsub.Scalar"(%34645, %34646, %34647) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
    %34649 = "torch.aten.mul.Tensor"(%34648, %34637) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %34650 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %34651 = "torch.aten.div.Scalar"(%34649, %34650) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %34652 = "torch.aten.mul.Tensor"(%34645, %34637) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %34653 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %34654 = "torch.aten.add.Tensor"(%34651, %34652, %34653) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %34655 = "torch.constant.float"() <{value = 2.048000e+03 : f64}> : () -> !torch.float
    %34656 = "torch.aten.lt.Scalar"(%34632, %34655) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %34657 = "torch.aten.bitwise_not"(%34656) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %34658 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
    %34659 = "torch.aten.gt.Scalar"(%34632, %34658) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %34660 = "torch.aten.bitwise_not"(%34659) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %34661 = "torch.aten.mul.Tensor"(%34657, %34660) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %34662 = "torch.aten.where.self"(%34661, %34654, %34637) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %34663 = "torch.prim.ListConstruct"(%34662, %34662) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.list<vtensor>
    %34664 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %34665 = "torch.aten.cat"(%34663, %34664) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[128],f32>
    %34666 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %34667 = "torch.prims.convert_element_type"(%34612, %34666) : (!torch.vtensor<[131072],si64>, !torch.int) -> !torch.vtensor<[131072],f32>
    %34668 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %34669 = "torch.prims.convert_element_type"(%34665, %34668) : (!torch.vtensor<[128],f32>, !torch.int) -> !torch.vtensor<[128],f32>
    %34670 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
    %34671 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %34672 = "torch.prim.ListConstruct"(%34670, %34671) : (!torch.int, !torch.int) -> !torch.list<int>
    %34673 = "torch.aten.view"(%34667, %34672) : (!torch.vtensor<[131072],f32>, !torch.list<int>) -> !torch.vtensor<[131072,1],f32>
    %34674 = "torch.aten.mul.Tensor"(%34673, %34669) : (!torch.vtensor<[131072,1],f32>, !torch.vtensor<[128],f32>) -> !torch.vtensor<[131072,128],f32>
    %34675 = "torch.aten.cos"(%34674) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
    %34676 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %34677 = "torch.prims.convert_element_type"(%34675, %34676) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
    %34678 = "torch.aten.sin"(%34674) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
    %34679 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %34680 = "torch.prims.convert_element_type"(%34678, %34679) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
    %34681 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %34682 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %34683 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %34684 = "torch.aten.slice.Tensor"(%34677, %34681, %34682, %18481, %34683) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%34684, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %34685 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %34686 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %34687 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %34688 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %34689 = "torch.aten.slice.Tensor"(%34684, %34685, %34686, %34687, %34688) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%34689, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %34690 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %34691 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %34692 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %34693 = "torch.aten.slice.Tensor"(%34680, %34690, %34691, %18481, %34692) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%34693, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %34694 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %34695 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %34696 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %34697 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %34698 = "torch.aten.slice.Tensor"(%34693, %34694, %34695, %34696, %34697) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%34698, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %34699 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %34700 = "torch.aten.unsqueeze"(%34689, %34699) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%34700, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %34701 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %34702 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %34703 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %34704 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %34705 = "torch.aten.slice.Tensor"(%34700, %34701, %34702, %34703, %34704) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%34705, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %34706 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %34707 = "torch.aten.unsqueeze"(%34705, %34706) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%34707, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %34708 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %34709 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %34710 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %34711 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %34712 = "torch.aten.slice.Tensor"(%34707, %34708, %34709, %34710, %34711) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%34712, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %34713 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %34714 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %34715 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %34716 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %34717 = "torch.prim.ListConstruct"(%34713, %34714, %34715, %34716) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %34718 = "torch.aten.repeat"(%34712, %34717) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%34718, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
    %34719 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %34720 = "torch.aten.unsqueeze"(%34698, %34719) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%34720, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %34721 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %34722 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %34723 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %34724 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %34725 = "torch.aten.slice.Tensor"(%34720, %34721, %34722, %34723, %34724) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%34725, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %34726 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %34727 = "torch.aten.unsqueeze"(%34725, %34726) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%34727, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %34728 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %34729 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %34730 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %34731 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %34732 = "torch.aten.slice.Tensor"(%34727, %34728, %34729, %34730, %34731) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%34732, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %34733 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %34734 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %34735 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %34736 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %34737 = "torch.prim.ListConstruct"(%34733, %34734, %34735, %34736) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %34738 = "torch.aten.repeat"(%34732, %34737) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%34738, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
    %34739 = "torch.aten.mul.Tensor"(%34451, %34718) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%34739, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %34740 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %34741 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %34742 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %34743 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %34744 = "torch.aten.slice.Tensor"(%34451, %34740, %34741, %34742, %34743) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%34744, %18474) <{shape_expressions = #map19}> : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.int) -> ()
    %34745 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %34746 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %34747 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %34748 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %34749 = "torch.aten.slice.Tensor"(%34451, %34745, %34746, %34747, %34748) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%34749, %18474) <{shape_expressions = #map19}> : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.int) -> ()
    %34750 = "torch.aten.neg"(%34749) : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>) -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%34750, %18474) <{shape_expressions = #map19}> : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.int) -> ()
    %34751 = "torch.prim.ListConstruct"(%34750, %34744) : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>) -> !torch.list<vtensor>
    %34752 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %34753 = "torch.aten.cat"(%34751, %34752) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%34753, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %34754 = "torch.aten.mul.Tensor"(%34753, %34738) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%34754, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %34755 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %34756 = "torch.aten.add.Tensor"(%34739, %34754, %34755) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%34756, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %34757 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %34758 = "torch.aten.mul.Scalar"(%arg69, %34757) : (!torch.vtensor<[4,?],si64>, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%34758, %18474) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %34759 = "torch.constant.int"() <{value = 48 : i64}> : () -> !torch.int
    %34760 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %34761 = "torch.aten.add.Scalar"(%34758, %34759, %34760) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%34761, %18474) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %34762 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %34763 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %34764 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %34765 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %34766 = "torch.prim.ListConstruct"(%34762, %18477, %34763, %34764, %34765) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %34767 = "torch.aten.view"(%34756, %34766) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%34767, %18474) <{shape_expressions = #map22}> : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %34768 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %34769 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %34770 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %34771 = "torch.prim.ListConstruct"(%19011, %34768, %34769, %34770) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %34772 = "torch.aten.view"(%34767, %34771) : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%34772, %18474) <{shape_expressions = #map23}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %34773 = "torch.prim.ListConstruct"(%19011) : (!torch.int) -> !torch.list<int>
    %34774 = "torch.aten.view"(%34761, %34773) : (!torch.vtensor<[4,?],si64>, !torch.list<int>) -> !torch.vtensor<[?],si64>
    "torch.bind_symbolic_shape"(%34774, %18474) <{shape_expressions = #map24}> : (!torch.vtensor<[?],si64>, !torch.int) -> ()
    %34775 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %34776 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %34777 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %34778 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %34779 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %34780 = "torch.prim.ListConstruct"(%18479, %34775, %34776, %34777, %34778, %34779) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %34781 = "torch.aten.view"(%34183, %34780) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%34781, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %34782 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %34783 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %34784 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %34785 = "torch.prim.ListConstruct"(%18993, %34782, %34783, %34784) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %34786 = "torch.aten.view"(%34781, %34785) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%34786, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %34787 = "torch.prim.ListConstruct"(%34774) : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>>
    %34788 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %34789 = "torch.aten.index_put"(%34786, %34787, %34772, %34788) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.bool) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%34789, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %34790 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %34791 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %34792 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %34793 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %34794 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %34795 = "torch.prim.ListConstruct"(%18479, %34790, %34791, %34792, %34793, %34794) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %34796 = "torch.aten.view"(%34789, %34795) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%34796, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %34797 = "torch.constant.int"() <{value = 2097152 : i64}> : () -> !torch.int
    %34798 = "torch.prim.ListConstruct"(%18479, %34797) : (!torch.int, !torch.int) -> !torch.list<int>
    %34799 = "torch.aten.view"(%34796, %34798) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,2097152],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%34799, %18475) <{shape_expressions = #map2}> : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> ()
    %34800 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %34801 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %34802 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %34803 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %34804 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %34805 = "torch.prim.ListConstruct"(%18479, %34800, %34801, %34802, %34803, %34804) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %34806 = "torch.aten.view"(%34799, %34805) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%34806, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %34807 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %34808 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %34809 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %34810 = "torch.prim.ListConstruct"(%18993, %34807, %34808, %34809) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %34811 = "torch.aten.view"(%34806, %34810) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%34811, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %34812 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %34813 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %34814 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %34815 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %34816 = "torch.prim.ListConstruct"(%34812, %18477, %34813, %34814, %34815) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %34817 = "torch.aten.view"(%34456, %34816) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%34817, %18474) <{shape_expressions = #map22}> : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %34818 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %34819 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %34820 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %34821 = "torch.prim.ListConstruct"(%19011, %34818, %34819, %34820) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %34822 = "torch.aten.view"(%34817, %34821) : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%34822, %18474) <{shape_expressions = #map23}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %34823 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %34824 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %34825 = "torch.aten.add.Scalar"(%34761, %34823, %34824) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%34825, %18474) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %34826 = "torch.prim.ListConstruct"(%19011) : (!torch.int) -> !torch.list<int>
    %34827 = "torch.aten.view"(%34825, %34826) : (!torch.vtensor<[4,?],si64>, !torch.list<int>) -> !torch.vtensor<[?],si64>
    "torch.bind_symbolic_shape"(%34827, %18474) <{shape_expressions = #map24}> : (!torch.vtensor<[?],si64>, !torch.int) -> ()
    %34828 = "torch.prim.ListConstruct"(%34827) : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>>
    %34829 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %34830 = "torch.aten.index_put"(%34811, %34828, %34822, %34829) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.bool) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%34830, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %34831 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %34832 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %34833 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %34834 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %34835 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %34836 = "torch.prim.ListConstruct"(%18479, %34831, %34832, %34833, %34834, %34835) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %34837 = "torch.aten.view"(%34830, %34836) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%34837, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %34838 = "torch.constant.int"() <{value = 2097152 : i64}> : () -> !torch.int
    %34839 = "torch.prim.ListConstruct"(%18479, %34838) : (!torch.int, !torch.int) -> !torch.list<int>
    %34840 = "torch.aten.view"(%34837, %34839) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,2097152],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%34840, %18475) <{shape_expressions = #map2}> : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> ()
    %34841 = "torch.constant.int"() <{value = -2 : i64}> : () -> !torch.int
    %34842 = "torch.aten.unsqueeze"(%34756, %34841) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%34842, %18474) <{shape_expressions = #map25}> : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.int) -> ()
    %34843 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %34844 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %34845 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %34846 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %34847 = "torch.prim.ListConstruct"(%34843, %18481, %34844, %34845, %34846) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %34848 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %34849 = "torch.aten.expand"(%34842, %34847, %34848) : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%34849, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %34850 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %34851 = "torch.aten.clone"(%34849, %34850) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%34851, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %34852 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %34853 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %34854 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %34855 = "torch.prim.ListConstruct"(%34852, %18481, %34853, %34854) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %34856 = "torch.aten._unsafe_view"(%34851, %34855) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%34856, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %34857 = "torch.constant.int"() <{value = -2 : i64}> : () -> !torch.int
    %34858 = "torch.aten.unsqueeze"(%34456, %34857) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%34858, %18474) <{shape_expressions = #map25}> : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.int) -> ()
    %34859 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %34860 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %34861 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %34862 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %34863 = "torch.prim.ListConstruct"(%34859, %18481, %34860, %34861, %34862) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %34864 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %34865 = "torch.aten.expand"(%34858, %34863, %34864) : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%34865, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %34866 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %34867 = "torch.aten.clone"(%34865, %34866) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%34867, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %34868 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %34869 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %34870 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %34871 = "torch.prim.ListConstruct"(%34868, %18481, %34869, %34870) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %34872 = "torch.aten._unsafe_view"(%34867, %34871) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%34872, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %34873 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %34874 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %34875 = "torch.aten.transpose.int"(%34606, %34873, %34874) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%34875, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %34876 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %34877 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %34878 = "torch.aten.transpose.int"(%34856, %34876, %34877) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%34878, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %34879 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %34880 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %34881 = "torch.aten.transpose.int"(%34872, %34879, %34880) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%34881, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %34882 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %34883 = "torch.aten.squeeze.dim"(%18570, %34882) : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,1,?,?],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%34883, %18474) <{shape_expressions = #map8}> : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> ()
    %34884 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %34885 = "torch.aten.squeeze.dim"(%34883, %34884) : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,1,?,?],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%34885, %18474) <{shape_expressions = #map8}> : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> ()
    %34886 = "torch_c.to_builtin_tensor"(%34875) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %34887 = "torch_c.to_builtin_tensor"(%34878) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %34888 = "torch_c.to_builtin_tensor"(%34881) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %34889 = "torch_c.to_builtin_tensor"(%34885) : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>) -> tensor<4x1x?x?xf8E4M3FNUZ>
    %34890 = "tensor.cast"(%34889) : (tensor<4x1x?x?xf8E4M3FNUZ>) -> tensor<?x?x?x?xf8E4M3FNUZ>
    %34891 = "torch_c.to_builtin_tensor"(%18169) : (!torch.vtensor<[],f32>) -> tensor<f32>
    %34892 = "util.call"(%34886, %34887, %34888, %34891, %34890) <{callee = @sharktank_masked_flash_attention_4_32_128_128_f8E4M3FNUZ_f32_f32}> : (tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<f32>, tensor<?x?x?x?xf8E4M3FNUZ>) -> tensor<4x32x?x128xf32>
    %34893 = "torch_c.from_builtin_tensor"(%34892) : (tensor<4x32x?x128xf32>) -> !torch.vtensor<[4,32,?,128],f32>
    "torch.bind_symbolic_shape"(%34893, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f32>, !torch.int) -> ()
    %34894 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %34895 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %34896 = "torch.aten.transpose.int"(%34893, %34894, %34895) : (!torch.vtensor<[4,32,?,128],f32>, !torch.int, !torch.int) -> !torch.vtensor<[4,?,32,128],f32>
    "torch.bind_symbolic_shape"(%34896, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f32>, !torch.int) -> ()
    %34897 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %34898 = "torch.aten.clone"(%34896, %34897) : (!torch.vtensor<[4,?,32,128],f32>, !torch.int) -> !torch.vtensor<[4,?,32,128],f32>
    "torch.bind_symbolic_shape"(%34898, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f32>, !torch.int) -> ()
    %34899 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %34900 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %34901 = "torch.prim.ListConstruct"(%34899, %18481, %34900) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %34902 = "torch.aten._unsafe_view"(%34898, %34901) : (!torch.vtensor<[4,?,32,128],f32>, !torch.list<int>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%34902, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %34903 = "torch.aten.div.Tensor"(%34902, %18171) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%34903, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %34904 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %34905 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %34906 = "torch.aten.clamp"(%34903, %34904, %34905) : (!torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%34906, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %34907 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %34908 = "torch.prims.convert_element_type"(%34906, %34907) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%34908, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %34909 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %34910 = "torch.aten.unsqueeze"(%18173, %34909) : (!torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>
    %34911 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %34912 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %34913 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %34914 = "torch.prim.ListConstruct"(%34911, %34912, %34913) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %34915 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %34916 = "torch.aten.expand"(%34910, %34914, %34915) : (!torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ>
    %34917 = "torch_c.to_builtin_tensor"(%34908) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %34918 = "torch_c.to_builtin_tensor"(%34916) : (!torch.vtensor<[4,4096,4096],f8E4M3FNUZ>) -> tensor<4x4096x4096xf8E4M3FNUZ>
    %34919 = "util.call"(%34917, %34918) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x?x4096xf32>
    %34920 = "torch_c.from_builtin_tensor"(%34919) : (tensor<4x?x4096xf32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%34920, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %34921 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %34922 = "torch.prims.convert_element_type"(%34920, %34921) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%34922, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %34923 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %34924 = "torch.aten.add.Tensor"(%34350, %34922, %34923) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%34924, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %34925 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %34926 = "torch.prims.convert_element_type"(%34924, %34925) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%34926, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %34927 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %34928 = "torch.aten.pow.Tensor_Scalar"(%34926, %34927) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%34928, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %34929 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %34930 = "torch.prim.ListConstruct"(%34929) : (!torch.int) -> !torch.list<int>
    %34931 = "torch.constant.bool"() <{value = true}> : () -> !torch.bool
    %34932 = "torch.constant.none"() : () -> !torch.none
    %34933 = "torch.aten.mean.dim"(%34928, %34930, %34931, %34932) : (!torch.vtensor<[4,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%34933, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %34934 = "torch.constant.float"() <{value = 1.000000e-05 : f64}> : () -> !torch.float
    %34935 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %34936 = "torch.aten.add.Scalar"(%34933, %34934, %34935) : (!torch.vtensor<[4,?,1],f32>, !torch.float, !torch.int) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%34936, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %34937 = "torch.aten.rsqrt"(%34936) : (!torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%34937, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %34938 = "torch.aten.mul.Tensor"(%34926, %34937) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%34938, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %34939 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %34940 = "torch.prims.convert_element_type"(%34938, %34939) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%34940, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %34941 = "torch.aten.mul.Tensor"(%18175, %34940) : (!torch.vtensor<[4096],bf16>, !torch.vtensor<[4,?,4096],bf16>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%34941, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %34942 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %34943 = "torch.prims.convert_element_type"(%34941, %34942) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%34943, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %34944 = "torch.aten.div.Tensor"(%34943, %18177) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%34944, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %34945 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %34946 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %34947 = "torch.aten.clamp"(%34944, %34945, %34946) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%34947, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %34948 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %34949 = "torch.prims.convert_element_type"(%34947, %34948) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%34949, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %34950 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %34951 = "torch.aten.unsqueeze"(%18179, %34950) : (!torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>
    %34952 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %34953 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %34954 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %34955 = "torch.prim.ListConstruct"(%34952, %34953, %34954) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %34956 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %34957 = "torch.aten.expand"(%34951, %34955, %34956) : (!torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ>
    %34958 = "torch_c.to_builtin_tensor"(%34949) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %34959 = "torch_c.to_builtin_tensor"(%34957) : (!torch.vtensor<[4,14336,4096],f8E4M3FNUZ>) -> tensor<4x14336x4096xf8E4M3FNUZ>
    %34960 = "util.call"(%34958, %34959) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x?x14336xf32>
    %34961 = "torch_c.from_builtin_tensor"(%34960) : (tensor<4x?x14336xf32>) -> !torch.vtensor<[4,?,14336],f32>
    "torch.bind_symbolic_shape"(%34961, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> ()
    %34962 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %34963 = "torch.prims.convert_element_type"(%34961, %34962) : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%34963, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %34964 = "torch.aten.silu"(%34963) : (!torch.vtensor<[4,?,14336],bf16>) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%34964, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %34965 = "torch.aten.div.Tensor"(%34943, %18181) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%34965, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %34966 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %34967 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %34968 = "torch.aten.clamp"(%34965, %34966, %34967) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%34968, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %34969 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %34970 = "torch.prims.convert_element_type"(%34968, %34969) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%34970, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %34971 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %34972 = "torch.aten.unsqueeze"(%18183, %34971) : (!torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>
    %34973 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %34974 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %34975 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %34976 = "torch.prim.ListConstruct"(%34973, %34974, %34975) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %34977 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %34978 = "torch.aten.expand"(%34972, %34976, %34977) : (!torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ>
    %34979 = "torch_c.to_builtin_tensor"(%34970) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %34980 = "torch_c.to_builtin_tensor"(%34978) : (!torch.vtensor<[4,14336,4096],f8E4M3FNUZ>) -> tensor<4x14336x4096xf8E4M3FNUZ>
    %34981 = "util.call"(%34979, %34980) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x?x14336xf32>
    %34982 = "torch_c.from_builtin_tensor"(%34981) : (tensor<4x?x14336xf32>) -> !torch.vtensor<[4,?,14336],f32>
    "torch.bind_symbolic_shape"(%34982, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> ()
    %34983 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %34984 = "torch.prims.convert_element_type"(%34982, %34983) : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%34984, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %34985 = "torch.aten.mul.Tensor"(%34964, %34984) : (!torch.vtensor<[4,?,14336],bf16>, !torch.vtensor<[4,?,14336],bf16>) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%34985, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %34986 = "torch.aten.div.Tensor"(%34985, %18185) : (!torch.vtensor<[4,?,14336],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%34986, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %34987 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %34988 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %34989 = "torch.aten.clamp"(%34986, %34987, %34988) : (!torch.vtensor<[4,?,14336],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%34989, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %34990 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %34991 = "torch.prims.convert_element_type"(%34989, %34990) : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> !torch.vtensor<[4,?,14336],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%34991, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],f8E4M3FNUZ>, !torch.int) -> ()
    %34992 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %34993 = "torch.aten.unsqueeze"(%18187, %34992) : (!torch.vtensor<[4096,14336],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,14336],f8E4M3FNUZ>
    %34994 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %34995 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %34996 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %34997 = "torch.prim.ListConstruct"(%34994, %34995, %34996) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %34998 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %34999 = "torch.aten.expand"(%34993, %34997, %34998) : (!torch.vtensor<[1,4096,14336],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,14336],f8E4M3FNUZ>
    %35000 = "torch_c.to_builtin_tensor"(%34991) : (!torch.vtensor<[4,?,14336],f8E4M3FNUZ>) -> tensor<4x?x14336xf8E4M3FNUZ>
    %35001 = "torch_c.to_builtin_tensor"(%34999) : (!torch.vtensor<[4,4096,14336],f8E4M3FNUZ>) -> tensor<4x4096x14336xf8E4M3FNUZ>
    %35002 = "util.call"(%35000, %35001) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx14336xf8E4M3FNUZ_R4x4096x14336xf8E4M3FNUZ}> : (tensor<4x?x14336xf8E4M3FNUZ>, tensor<4x4096x14336xf8E4M3FNUZ>) -> tensor<4x?x4096xf32>
    %35003 = "torch_c.from_builtin_tensor"(%35002) : (tensor<4x?x4096xf32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%35003, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %35004 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %35005 = "torch.prims.convert_element_type"(%35003, %35004) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%35005, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %35006 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %35007 = "torch.aten.add.Tensor"(%34924, %35005, %35006) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%35007, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %35008 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %35009 = "torch.prims.convert_element_type"(%35007, %35008) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%35009, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %35010 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %35011 = "torch.aten.pow.Tensor_Scalar"(%35009, %35010) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%35011, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %35012 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %35013 = "torch.prim.ListConstruct"(%35012) : (!torch.int) -> !torch.list<int>
    %35014 = "torch.constant.bool"() <{value = true}> : () -> !torch.bool
    %35015 = "torch.constant.none"() : () -> !torch.none
    %35016 = "torch.aten.mean.dim"(%35011, %35013, %35014, %35015) : (!torch.vtensor<[4,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%35016, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %35017 = "torch.constant.float"() <{value = 1.000000e-05 : f64}> : () -> !torch.float
    %35018 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %35019 = "torch.aten.add.Scalar"(%35016, %35017, %35018) : (!torch.vtensor<[4,?,1],f32>, !torch.float, !torch.int) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%35019, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %35020 = "torch.aten.rsqrt"(%35019) : (!torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%35020, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %35021 = "torch.aten.mul.Tensor"(%35009, %35020) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%35021, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %35022 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %35023 = "torch.prims.convert_element_type"(%35021, %35022) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%35023, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %35024 = "torch.aten.mul.Tensor"(%18189, %35023) : (!torch.vtensor<[4096],bf16>, !torch.vtensor<[4,?,4096],bf16>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%35024, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %35025 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %35026 = "torch.prims.convert_element_type"(%35024, %35025) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%35026, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %35027 = "torch.aten.div.Tensor"(%35026, %18191) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%35027, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %35028 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %35029 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %35030 = "torch.aten.clamp"(%35027, %35028, %35029) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%35030, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %35031 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %35032 = "torch.prims.convert_element_type"(%35030, %35031) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%35032, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %35033 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %35034 = "torch.aten.unsqueeze"(%18193, %35033) : (!torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>
    %35035 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %35036 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %35037 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %35038 = "torch.prim.ListConstruct"(%35035, %35036, %35037) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %35039 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %35040 = "torch.aten.expand"(%35034, %35038, %35039) : (!torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ>
    %35041 = "torch_c.to_builtin_tensor"(%35032) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %35042 = "torch_c.to_builtin_tensor"(%35040) : (!torch.vtensor<[4,4096,4096],f8E4M3FNUZ>) -> tensor<4x4096x4096xf8E4M3FNUZ>
    %35043 = "util.call"(%35041, %35042) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x?x4096xf32>
    %35044 = "torch_c.from_builtin_tensor"(%35043) : (tensor<4x?x4096xf32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%35044, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %35045 = "torch.aten.div.Tensor"(%35044, %18195) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%35045, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %35046 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %35047 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %35048 = "torch.aten.clamp"(%35045, %35046, %35047) : (!torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%35048, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %35049 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %35050 = "torch.prims.convert_element_type"(%35048, %35049) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%35050, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %35051 = "torch.aten.div.Tensor"(%35026, %18197) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%35051, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %35052 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %35053 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %35054 = "torch.aten.clamp"(%35051, %35052, %35053) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%35054, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %35055 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %35056 = "torch.prims.convert_element_type"(%35054, %35055) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%35056, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %35057 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %35058 = "torch.aten.unsqueeze"(%18199, %35057) : (!torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>
    %35059 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %35060 = "torch.constant.int"() <{value = 1024 : i64}> : () -> !torch.int
    %35061 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %35062 = "torch.prim.ListConstruct"(%35059, %35060, %35061) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %35063 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %35064 = "torch.aten.expand"(%35058, %35062, %35063) : (!torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ>
    %35065 = "torch_c.to_builtin_tensor"(%35056) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %35066 = "torch_c.to_builtin_tensor"(%35064) : (!torch.vtensor<[4,1024,4096],f8E4M3FNUZ>) -> tensor<4x1024x4096xf8E4M3FNUZ>
    %35067 = "util.call"(%35065, %35066) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x?x1024xf32>
    %35068 = "torch_c.from_builtin_tensor"(%35067) : (tensor<4x?x1024xf32>) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%35068, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %35069 = "torch.aten.div.Tensor"(%35068, %18201) : (!torch.vtensor<[4,?,1024],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%35069, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %35070 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %35071 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %35072 = "torch.aten.clamp"(%35069, %35070, %35071) : (!torch.vtensor<[4,?,1024],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%35072, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %35073 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %35074 = "torch.prims.convert_element_type"(%35072, %35073) : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> !torch.vtensor<[4,?,1024],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%35074, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.int) -> ()
    %35075 = "torch.aten.div.Tensor"(%35026, %18203) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%35075, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %35076 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %35077 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %35078 = "torch.aten.clamp"(%35075, %35076, %35077) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%35078, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %35079 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %35080 = "torch.prims.convert_element_type"(%35078, %35079) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%35080, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %35081 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %35082 = "torch.aten.unsqueeze"(%18205, %35081) : (!torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>
    %35083 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %35084 = "torch.constant.int"() <{value = 1024 : i64}> : () -> !torch.int
    %35085 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %35086 = "torch.prim.ListConstruct"(%35083, %35084, %35085) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %35087 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %35088 = "torch.aten.expand"(%35082, %35086, %35087) : (!torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ>
    %35089 = "torch_c.to_builtin_tensor"(%35080) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %35090 = "torch_c.to_builtin_tensor"(%35088) : (!torch.vtensor<[4,1024,4096],f8E4M3FNUZ>) -> tensor<4x1024x4096xf8E4M3FNUZ>
    %35091 = "util.call"(%35089, %35090) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x?x1024xf32>
    %35092 = "torch_c.from_builtin_tensor"(%35091) : (tensor<4x?x1024xf32>) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%35092, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %35093 = "torch.aten.div.Tensor"(%35092, %18207) : (!torch.vtensor<[4,?,1024],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%35093, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %35094 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %35095 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %35096 = "torch.aten.clamp"(%35093, %35094, %35095) : (!torch.vtensor<[4,?,1024],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%35096, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %35097 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %35098 = "torch.prims.convert_element_type"(%35096, %35097) : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> !torch.vtensor<[4,?,1024],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%35098, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.int) -> ()
    %35099 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %35100 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %35101 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %35102 = "torch.prim.ListConstruct"(%35099, %18481, %35100, %35101) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %35103 = "torch.aten.view"(%35050, %35102) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%35103, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %35104 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %35105 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %35106 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %35107 = "torch.prim.ListConstruct"(%35104, %18481, %35105, %35106) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %35108 = "torch.aten.view"(%35074, %35107) : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%35108, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %35109 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %35110 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %35111 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %35112 = "torch.prim.ListConstruct"(%35109, %18481, %35110, %35111) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %35113 = "torch.aten.view"(%35098, %35112) : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%35113, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %35114 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
    %35115 = "torch.constant.none"() : () -> !torch.none
    %35116 = "torch.constant.none"() : () -> !torch.none
    %35117 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %35118 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %35119 = "torch.aten.arange"(%35114, %35115, %35116, %35117, %35118) : (!torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[131072],si64>
    %35120 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %35121 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %35122 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %35123 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %35124 = "torch.constant.none"() : () -> !torch.none
    %35125 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %35126 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %35127 = "torch.aten.arange.start_step"(%35120, %35121, %35122, %35123, %35124, %35125, %35126) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[64],si64>
    %35128 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %35129 = "torch.prims.convert_element_type"(%35127, %35128) : (!torch.vtensor<[64],si64>, !torch.int) -> !torch.vtensor<[64],f32>
    %35130 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %35131 = "torch.aten.div.Scalar"(%35129, %35130) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %35132 = "torch.constant.float"() <{value = 5.000000e+05 : f64}> : () -> !torch.float
    %35133 = "torch.aten.pow.Scalar"(%35132, %35131) : (!torch.float, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %35134 = "torch.aten.reciprocal"(%35133) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %35135 = "torch.constant.float"() <{value = 1.000000e+00 : f64}> : () -> !torch.float
    %35136 = "torch.aten.mul.Scalar"(%35134, %35135) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
    %35137 = "torch.aten.reciprocal"(%35136) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %35138 = "torch.constant.float"() <{value = 6.2831853071795862 : f64}> : () -> !torch.float
    %35139 = "torch.aten.mul.Scalar"(%35137, %35138) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
    %35140 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
    %35141 = "torch.aten.gt.Scalar"(%35139, %35140) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %35142 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %35143 = "torch.aten.div.Scalar"(%35136, %35142) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %35144 = "torch.aten.where.self"(%35141, %35143, %35136) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %35145 = "torch.aten.reciprocal"(%35139) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %35146 = "torch.constant.int"() <{value = 8192 : i64}> : () -> !torch.int
    %35147 = "torch.aten.mul.Scalar"(%35145, %35146) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %35148 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %35149 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %35150 = "torch.aten.sub.Scalar"(%35147, %35148, %35149) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
    %35151 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %35152 = "torch.aten.div.Scalar"(%35150, %35151) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %35153 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %35154 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %35155 = "torch.aten.rsub.Scalar"(%35152, %35153, %35154) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
    %35156 = "torch.aten.mul.Tensor"(%35155, %35144) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %35157 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %35158 = "torch.aten.div.Scalar"(%35156, %35157) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %35159 = "torch.aten.mul.Tensor"(%35152, %35144) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %35160 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %35161 = "torch.aten.add.Tensor"(%35158, %35159, %35160) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %35162 = "torch.constant.float"() <{value = 2.048000e+03 : f64}> : () -> !torch.float
    %35163 = "torch.aten.lt.Scalar"(%35139, %35162) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %35164 = "torch.aten.bitwise_not"(%35163) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %35165 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
    %35166 = "torch.aten.gt.Scalar"(%35139, %35165) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %35167 = "torch.aten.bitwise_not"(%35166) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %35168 = "torch.aten.mul.Tensor"(%35164, %35167) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %35169 = "torch.aten.where.self"(%35168, %35161, %35144) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %35170 = "torch.prim.ListConstruct"(%35169, %35169) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.list<vtensor>
    %35171 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %35172 = "torch.aten.cat"(%35170, %35171) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[128],f32>
    %35173 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %35174 = "torch.prims.convert_element_type"(%35119, %35173) : (!torch.vtensor<[131072],si64>, !torch.int) -> !torch.vtensor<[131072],f32>
    %35175 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %35176 = "torch.prims.convert_element_type"(%35172, %35175) : (!torch.vtensor<[128],f32>, !torch.int) -> !torch.vtensor<[128],f32>
    %35177 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
    %35178 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %35179 = "torch.prim.ListConstruct"(%35177, %35178) : (!torch.int, !torch.int) -> !torch.list<int>
    %35180 = "torch.aten.view"(%35174, %35179) : (!torch.vtensor<[131072],f32>, !torch.list<int>) -> !torch.vtensor<[131072,1],f32>
    %35181 = "torch.aten.mul.Tensor"(%35180, %35176) : (!torch.vtensor<[131072,1],f32>, !torch.vtensor<[128],f32>) -> !torch.vtensor<[131072,128],f32>
    %35182 = "torch.aten.cos"(%35181) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
    %35183 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %35184 = "torch.prims.convert_element_type"(%35182, %35183) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
    %35185 = "torch.aten.sin"(%35181) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
    %35186 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %35187 = "torch.prims.convert_element_type"(%35185, %35186) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
    %35188 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %35189 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %35190 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %35191 = "torch.aten.slice.Tensor"(%35184, %35188, %35189, %18481, %35190) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%35191, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %35192 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %35193 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %35194 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %35195 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %35196 = "torch.aten.slice.Tensor"(%35191, %35192, %35193, %35194, %35195) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%35196, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %35197 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %35198 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %35199 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %35200 = "torch.aten.slice.Tensor"(%35187, %35197, %35198, %18481, %35199) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%35200, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %35201 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %35202 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %35203 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %35204 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %35205 = "torch.aten.slice.Tensor"(%35200, %35201, %35202, %35203, %35204) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%35205, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %35206 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %35207 = "torch.aten.unsqueeze"(%35196, %35206) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%35207, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %35208 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %35209 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %35210 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %35211 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %35212 = "torch.aten.slice.Tensor"(%35207, %35208, %35209, %35210, %35211) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%35212, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %35213 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %35214 = "torch.aten.unsqueeze"(%35212, %35213) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%35214, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %35215 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %35216 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %35217 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %35218 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %35219 = "torch.aten.slice.Tensor"(%35214, %35215, %35216, %35217, %35218) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%35219, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %35220 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %35221 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %35222 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %35223 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %35224 = "torch.prim.ListConstruct"(%35220, %35221, %35222, %35223) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %35225 = "torch.aten.repeat"(%35219, %35224) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%35225, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
    %35226 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %35227 = "torch.aten.unsqueeze"(%35205, %35226) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%35227, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %35228 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %35229 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %35230 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %35231 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %35232 = "torch.aten.slice.Tensor"(%35227, %35228, %35229, %35230, %35231) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%35232, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %35233 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %35234 = "torch.aten.unsqueeze"(%35232, %35233) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%35234, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %35235 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %35236 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %35237 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %35238 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %35239 = "torch.aten.slice.Tensor"(%35234, %35235, %35236, %35237, %35238) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%35239, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %35240 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %35241 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %35242 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %35243 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %35244 = "torch.prim.ListConstruct"(%35240, %35241, %35242, %35243) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %35245 = "torch.aten.repeat"(%35239, %35244) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%35245, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
    %35246 = "torch.aten.mul.Tensor"(%35103, %35225) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%35246, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %35247 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %35248 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %35249 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %35250 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %35251 = "torch.aten.slice.Tensor"(%35103, %35247, %35248, %35249, %35250) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%35251, %18474) <{shape_expressions = #map18}> : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.int) -> ()
    %35252 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %35253 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %35254 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %35255 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %35256 = "torch.aten.slice.Tensor"(%35103, %35252, %35253, %35254, %35255) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%35256, %18474) <{shape_expressions = #map18}> : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.int) -> ()
    %35257 = "torch.aten.neg"(%35256) : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>) -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%35257, %18474) <{shape_expressions = #map18}> : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.int) -> ()
    %35258 = "torch.prim.ListConstruct"(%35257, %35251) : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>) -> !torch.list<vtensor>
    %35259 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %35260 = "torch.aten.cat"(%35258, %35259) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%35260, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %35261 = "torch.aten.mul.Tensor"(%35260, %35245) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%35261, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %35262 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %35263 = "torch.aten.add.Tensor"(%35246, %35261, %35262) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%35263, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %35264 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
    %35265 = "torch.constant.none"() : () -> !torch.none
    %35266 = "torch.constant.none"() : () -> !torch.none
    %35267 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %35268 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %35269 = "torch.aten.arange"(%35264, %35265, %35266, %35267, %35268) : (!torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[131072],si64>
    %35270 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %35271 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %35272 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %35273 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %35274 = "torch.constant.none"() : () -> !torch.none
    %35275 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %35276 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %35277 = "torch.aten.arange.start_step"(%35270, %35271, %35272, %35273, %35274, %35275, %35276) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[64],si64>
    %35278 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %35279 = "torch.prims.convert_element_type"(%35277, %35278) : (!torch.vtensor<[64],si64>, !torch.int) -> !torch.vtensor<[64],f32>
    %35280 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %35281 = "torch.aten.div.Scalar"(%35279, %35280) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %35282 = "torch.constant.float"() <{value = 5.000000e+05 : f64}> : () -> !torch.float
    %35283 = "torch.aten.pow.Scalar"(%35282, %35281) : (!torch.float, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %35284 = "torch.aten.reciprocal"(%35283) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %35285 = "torch.constant.float"() <{value = 1.000000e+00 : f64}> : () -> !torch.float
    %35286 = "torch.aten.mul.Scalar"(%35284, %35285) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
    %35287 = "torch.aten.reciprocal"(%35286) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %35288 = "torch.constant.float"() <{value = 6.2831853071795862 : f64}> : () -> !torch.float
    %35289 = "torch.aten.mul.Scalar"(%35287, %35288) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
    %35290 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
    %35291 = "torch.aten.gt.Scalar"(%35289, %35290) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %35292 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %35293 = "torch.aten.div.Scalar"(%35286, %35292) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %35294 = "torch.aten.where.self"(%35291, %35293, %35286) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %35295 = "torch.aten.reciprocal"(%35289) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %35296 = "torch.constant.int"() <{value = 8192 : i64}> : () -> !torch.int
    %35297 = "torch.aten.mul.Scalar"(%35295, %35296) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %35298 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %35299 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %35300 = "torch.aten.sub.Scalar"(%35297, %35298, %35299) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
    %35301 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %35302 = "torch.aten.div.Scalar"(%35300, %35301) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %35303 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %35304 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %35305 = "torch.aten.rsub.Scalar"(%35302, %35303, %35304) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
    %35306 = "torch.aten.mul.Tensor"(%35305, %35294) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %35307 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %35308 = "torch.aten.div.Scalar"(%35306, %35307) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %35309 = "torch.aten.mul.Tensor"(%35302, %35294) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %35310 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %35311 = "torch.aten.add.Tensor"(%35308, %35309, %35310) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %35312 = "torch.constant.float"() <{value = 2.048000e+03 : f64}> : () -> !torch.float
    %35313 = "torch.aten.lt.Scalar"(%35289, %35312) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %35314 = "torch.aten.bitwise_not"(%35313) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %35315 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
    %35316 = "torch.aten.gt.Scalar"(%35289, %35315) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %35317 = "torch.aten.bitwise_not"(%35316) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %35318 = "torch.aten.mul.Tensor"(%35314, %35317) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %35319 = "torch.aten.where.self"(%35318, %35311, %35294) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %35320 = "torch.prim.ListConstruct"(%35319, %35319) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.list<vtensor>
    %35321 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %35322 = "torch.aten.cat"(%35320, %35321) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[128],f32>
    %35323 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %35324 = "torch.prims.convert_element_type"(%35269, %35323) : (!torch.vtensor<[131072],si64>, !torch.int) -> !torch.vtensor<[131072],f32>
    %35325 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %35326 = "torch.prims.convert_element_type"(%35322, %35325) : (!torch.vtensor<[128],f32>, !torch.int) -> !torch.vtensor<[128],f32>
    %35327 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
    %35328 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %35329 = "torch.prim.ListConstruct"(%35327, %35328) : (!torch.int, !torch.int) -> !torch.list<int>
    %35330 = "torch.aten.view"(%35324, %35329) : (!torch.vtensor<[131072],f32>, !torch.list<int>) -> !torch.vtensor<[131072,1],f32>
    %35331 = "torch.aten.mul.Tensor"(%35330, %35326) : (!torch.vtensor<[131072,1],f32>, !torch.vtensor<[128],f32>) -> !torch.vtensor<[131072,128],f32>
    %35332 = "torch.aten.cos"(%35331) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
    %35333 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %35334 = "torch.prims.convert_element_type"(%35332, %35333) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
    %35335 = "torch.aten.sin"(%35331) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
    %35336 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %35337 = "torch.prims.convert_element_type"(%35335, %35336) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
    %35338 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %35339 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %35340 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %35341 = "torch.aten.slice.Tensor"(%35334, %35338, %35339, %18481, %35340) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%35341, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %35342 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %35343 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %35344 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %35345 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %35346 = "torch.aten.slice.Tensor"(%35341, %35342, %35343, %35344, %35345) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%35346, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %35347 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %35348 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %35349 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %35350 = "torch.aten.slice.Tensor"(%35337, %35347, %35348, %18481, %35349) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%35350, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %35351 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %35352 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %35353 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %35354 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %35355 = "torch.aten.slice.Tensor"(%35350, %35351, %35352, %35353, %35354) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%35355, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %35356 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %35357 = "torch.aten.unsqueeze"(%35346, %35356) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%35357, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %35358 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %35359 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %35360 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %35361 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %35362 = "torch.aten.slice.Tensor"(%35357, %35358, %35359, %35360, %35361) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%35362, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %35363 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %35364 = "torch.aten.unsqueeze"(%35362, %35363) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%35364, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %35365 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %35366 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %35367 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %35368 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %35369 = "torch.aten.slice.Tensor"(%35364, %35365, %35366, %35367, %35368) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%35369, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %35370 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %35371 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %35372 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %35373 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %35374 = "torch.prim.ListConstruct"(%35370, %35371, %35372, %35373) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %35375 = "torch.aten.repeat"(%35369, %35374) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%35375, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
    %35376 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %35377 = "torch.aten.unsqueeze"(%35355, %35376) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%35377, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %35378 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %35379 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %35380 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %35381 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %35382 = "torch.aten.slice.Tensor"(%35377, %35378, %35379, %35380, %35381) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%35382, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %35383 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %35384 = "torch.aten.unsqueeze"(%35382, %35383) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%35384, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %35385 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %35386 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %35387 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %35388 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %35389 = "torch.aten.slice.Tensor"(%35384, %35385, %35386, %35387, %35388) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%35389, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %35390 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %35391 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %35392 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %35393 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %35394 = "torch.prim.ListConstruct"(%35390, %35391, %35392, %35393) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %35395 = "torch.aten.repeat"(%35389, %35394) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%35395, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
    %35396 = "torch.aten.mul.Tensor"(%35108, %35375) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%35396, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %35397 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %35398 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %35399 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %35400 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %35401 = "torch.aten.slice.Tensor"(%35108, %35397, %35398, %35399, %35400) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%35401, %18474) <{shape_expressions = #map19}> : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.int) -> ()
    %35402 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %35403 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %35404 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %35405 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %35406 = "torch.aten.slice.Tensor"(%35108, %35402, %35403, %35404, %35405) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%35406, %18474) <{shape_expressions = #map19}> : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.int) -> ()
    %35407 = "torch.aten.neg"(%35406) : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>) -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%35407, %18474) <{shape_expressions = #map19}> : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.int) -> ()
    %35408 = "torch.prim.ListConstruct"(%35407, %35401) : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>) -> !torch.list<vtensor>
    %35409 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %35410 = "torch.aten.cat"(%35408, %35409) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%35410, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %35411 = "torch.aten.mul.Tensor"(%35410, %35395) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%35411, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %35412 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %35413 = "torch.aten.add.Tensor"(%35396, %35411, %35412) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%35413, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %35414 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %35415 = "torch.aten.mul.Scalar"(%arg69, %35414) : (!torch.vtensor<[4,?],si64>, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%35415, %18474) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %35416 = "torch.constant.int"() <{value = 50 : i64}> : () -> !torch.int
    %35417 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %35418 = "torch.aten.add.Scalar"(%35415, %35416, %35417) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%35418, %18474) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %35419 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %35420 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %35421 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %35422 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %35423 = "torch.prim.ListConstruct"(%35419, %18477, %35420, %35421, %35422) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %35424 = "torch.aten.view"(%35413, %35423) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%35424, %18474) <{shape_expressions = #map22}> : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %35425 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %35426 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %35427 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %35428 = "torch.prim.ListConstruct"(%19011, %35425, %35426, %35427) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %35429 = "torch.aten.view"(%35424, %35428) : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%35429, %18474) <{shape_expressions = #map23}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %35430 = "torch.prim.ListConstruct"(%19011) : (!torch.int) -> !torch.list<int>
    %35431 = "torch.aten.view"(%35418, %35430) : (!torch.vtensor<[4,?],si64>, !torch.list<int>) -> !torch.vtensor<[?],si64>
    "torch.bind_symbolic_shape"(%35431, %18474) <{shape_expressions = #map24}> : (!torch.vtensor<[?],si64>, !torch.int) -> ()
    %35432 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %35433 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %35434 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %35435 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %35436 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %35437 = "torch.prim.ListConstruct"(%18479, %35432, %35433, %35434, %35435, %35436) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %35438 = "torch.aten.view"(%34840, %35437) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%35438, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %35439 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %35440 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %35441 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %35442 = "torch.prim.ListConstruct"(%18993, %35439, %35440, %35441) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %35443 = "torch.aten.view"(%35438, %35442) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%35443, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %35444 = "torch.prim.ListConstruct"(%35431) : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>>
    %35445 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %35446 = "torch.aten.index_put"(%35443, %35444, %35429, %35445) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.bool) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%35446, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %35447 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %35448 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %35449 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %35450 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %35451 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %35452 = "torch.prim.ListConstruct"(%18479, %35447, %35448, %35449, %35450, %35451) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %35453 = "torch.aten.view"(%35446, %35452) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%35453, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %35454 = "torch.constant.int"() <{value = 2097152 : i64}> : () -> !torch.int
    %35455 = "torch.prim.ListConstruct"(%18479, %35454) : (!torch.int, !torch.int) -> !torch.list<int>
    %35456 = "torch.aten.view"(%35453, %35455) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,2097152],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%35456, %18475) <{shape_expressions = #map2}> : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> ()
    %35457 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %35458 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %35459 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %35460 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %35461 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %35462 = "torch.prim.ListConstruct"(%18479, %35457, %35458, %35459, %35460, %35461) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %35463 = "torch.aten.view"(%35456, %35462) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%35463, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %35464 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %35465 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %35466 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %35467 = "torch.prim.ListConstruct"(%18993, %35464, %35465, %35466) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %35468 = "torch.aten.view"(%35463, %35467) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%35468, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %35469 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %35470 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %35471 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %35472 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %35473 = "torch.prim.ListConstruct"(%35469, %18477, %35470, %35471, %35472) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %35474 = "torch.aten.view"(%35113, %35473) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%35474, %18474) <{shape_expressions = #map22}> : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %35475 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %35476 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %35477 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %35478 = "torch.prim.ListConstruct"(%19011, %35475, %35476, %35477) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %35479 = "torch.aten.view"(%35474, %35478) : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%35479, %18474) <{shape_expressions = #map23}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %35480 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %35481 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %35482 = "torch.aten.add.Scalar"(%35418, %35480, %35481) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%35482, %18474) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %35483 = "torch.prim.ListConstruct"(%19011) : (!torch.int) -> !torch.list<int>
    %35484 = "torch.aten.view"(%35482, %35483) : (!torch.vtensor<[4,?],si64>, !torch.list<int>) -> !torch.vtensor<[?],si64>
    "torch.bind_symbolic_shape"(%35484, %18474) <{shape_expressions = #map24}> : (!torch.vtensor<[?],si64>, !torch.int) -> ()
    %35485 = "torch.prim.ListConstruct"(%35484) : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>>
    %35486 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %35487 = "torch.aten.index_put"(%35468, %35485, %35479, %35486) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.bool) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%35487, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %35488 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %35489 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %35490 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %35491 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %35492 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %35493 = "torch.prim.ListConstruct"(%18479, %35488, %35489, %35490, %35491, %35492) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %35494 = "torch.aten.view"(%35487, %35493) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%35494, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %35495 = "torch.constant.int"() <{value = 2097152 : i64}> : () -> !torch.int
    %35496 = "torch.prim.ListConstruct"(%18479, %35495) : (!torch.int, !torch.int) -> !torch.list<int>
    %35497 = "torch.aten.view"(%35494, %35496) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,2097152],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%35497, %18475) <{shape_expressions = #map2}> : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> ()
    %35498 = "torch.constant.int"() <{value = -2 : i64}> : () -> !torch.int
    %35499 = "torch.aten.unsqueeze"(%35413, %35498) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%35499, %18474) <{shape_expressions = #map25}> : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.int) -> ()
    %35500 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %35501 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %35502 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %35503 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %35504 = "torch.prim.ListConstruct"(%35500, %18481, %35501, %35502, %35503) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %35505 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %35506 = "torch.aten.expand"(%35499, %35504, %35505) : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%35506, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %35507 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %35508 = "torch.aten.clone"(%35506, %35507) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%35508, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %35509 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %35510 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %35511 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %35512 = "torch.prim.ListConstruct"(%35509, %18481, %35510, %35511) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %35513 = "torch.aten._unsafe_view"(%35508, %35512) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%35513, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %35514 = "torch.constant.int"() <{value = -2 : i64}> : () -> !torch.int
    %35515 = "torch.aten.unsqueeze"(%35113, %35514) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%35515, %18474) <{shape_expressions = #map25}> : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.int) -> ()
    %35516 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %35517 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %35518 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %35519 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %35520 = "torch.prim.ListConstruct"(%35516, %18481, %35517, %35518, %35519) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %35521 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %35522 = "torch.aten.expand"(%35515, %35520, %35521) : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%35522, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %35523 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %35524 = "torch.aten.clone"(%35522, %35523) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%35524, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %35525 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %35526 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %35527 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %35528 = "torch.prim.ListConstruct"(%35525, %18481, %35526, %35527) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %35529 = "torch.aten._unsafe_view"(%35524, %35528) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%35529, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %35530 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %35531 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %35532 = "torch.aten.transpose.int"(%35263, %35530, %35531) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%35532, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %35533 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %35534 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %35535 = "torch.aten.transpose.int"(%35513, %35533, %35534) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%35535, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %35536 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %35537 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %35538 = "torch.aten.transpose.int"(%35529, %35536, %35537) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%35538, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %35539 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %35540 = "torch.aten.squeeze.dim"(%18570, %35539) : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,1,?,?],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%35540, %18474) <{shape_expressions = #map8}> : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> ()
    %35541 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %35542 = "torch.aten.squeeze.dim"(%35540, %35541) : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,1,?,?],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%35542, %18474) <{shape_expressions = #map8}> : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> ()
    %35543 = "torch_c.to_builtin_tensor"(%35532) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %35544 = "torch_c.to_builtin_tensor"(%35535) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %35545 = "torch_c.to_builtin_tensor"(%35538) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %35546 = "torch_c.to_builtin_tensor"(%35542) : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>) -> tensor<4x1x?x?xf8E4M3FNUZ>
    %35547 = "tensor.cast"(%35546) : (tensor<4x1x?x?xf8E4M3FNUZ>) -> tensor<?x?x?x?xf8E4M3FNUZ>
    %35548 = "torch_c.to_builtin_tensor"(%18209) : (!torch.vtensor<[],f32>) -> tensor<f32>
    %35549 = "util.call"(%35543, %35544, %35545, %35548, %35547) <{callee = @sharktank_masked_flash_attention_4_32_128_128_f8E4M3FNUZ_f32_f32}> : (tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<f32>, tensor<?x?x?x?xf8E4M3FNUZ>) -> tensor<4x32x?x128xf32>
    %35550 = "torch_c.from_builtin_tensor"(%35549) : (tensor<4x32x?x128xf32>) -> !torch.vtensor<[4,32,?,128],f32>
    "torch.bind_symbolic_shape"(%35550, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f32>, !torch.int) -> ()
    %35551 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %35552 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %35553 = "torch.aten.transpose.int"(%35550, %35551, %35552) : (!torch.vtensor<[4,32,?,128],f32>, !torch.int, !torch.int) -> !torch.vtensor<[4,?,32,128],f32>
    "torch.bind_symbolic_shape"(%35553, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f32>, !torch.int) -> ()
    %35554 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %35555 = "torch.aten.clone"(%35553, %35554) : (!torch.vtensor<[4,?,32,128],f32>, !torch.int) -> !torch.vtensor<[4,?,32,128],f32>
    "torch.bind_symbolic_shape"(%35555, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f32>, !torch.int) -> ()
    %35556 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %35557 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %35558 = "torch.prim.ListConstruct"(%35556, %18481, %35557) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %35559 = "torch.aten._unsafe_view"(%35555, %35558) : (!torch.vtensor<[4,?,32,128],f32>, !torch.list<int>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%35559, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %35560 = "torch.aten.div.Tensor"(%35559, %18211) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%35560, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %35561 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %35562 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %35563 = "torch.aten.clamp"(%35560, %35561, %35562) : (!torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%35563, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %35564 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %35565 = "torch.prims.convert_element_type"(%35563, %35564) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%35565, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %35566 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %35567 = "torch.aten.unsqueeze"(%18213, %35566) : (!torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>
    %35568 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %35569 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %35570 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %35571 = "torch.prim.ListConstruct"(%35568, %35569, %35570) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %35572 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %35573 = "torch.aten.expand"(%35567, %35571, %35572) : (!torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ>
    %35574 = "torch_c.to_builtin_tensor"(%35565) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %35575 = "torch_c.to_builtin_tensor"(%35573) : (!torch.vtensor<[4,4096,4096],f8E4M3FNUZ>) -> tensor<4x4096x4096xf8E4M3FNUZ>
    %35576 = "util.call"(%35574, %35575) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x?x4096xf32>
    %35577 = "torch_c.from_builtin_tensor"(%35576) : (tensor<4x?x4096xf32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%35577, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %35578 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %35579 = "torch.prims.convert_element_type"(%35577, %35578) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%35579, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %35580 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %35581 = "torch.aten.add.Tensor"(%35007, %35579, %35580) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%35581, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %35582 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %35583 = "torch.prims.convert_element_type"(%35581, %35582) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%35583, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %35584 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %35585 = "torch.aten.pow.Tensor_Scalar"(%35583, %35584) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%35585, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %35586 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %35587 = "torch.prim.ListConstruct"(%35586) : (!torch.int) -> !torch.list<int>
    %35588 = "torch.constant.bool"() <{value = true}> : () -> !torch.bool
    %35589 = "torch.constant.none"() : () -> !torch.none
    %35590 = "torch.aten.mean.dim"(%35585, %35587, %35588, %35589) : (!torch.vtensor<[4,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%35590, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %35591 = "torch.constant.float"() <{value = 1.000000e-05 : f64}> : () -> !torch.float
    %35592 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %35593 = "torch.aten.add.Scalar"(%35590, %35591, %35592) : (!torch.vtensor<[4,?,1],f32>, !torch.float, !torch.int) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%35593, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %35594 = "torch.aten.rsqrt"(%35593) : (!torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%35594, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %35595 = "torch.aten.mul.Tensor"(%35583, %35594) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%35595, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %35596 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %35597 = "torch.prims.convert_element_type"(%35595, %35596) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%35597, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %35598 = "torch.aten.mul.Tensor"(%18215, %35597) : (!torch.vtensor<[4096],bf16>, !torch.vtensor<[4,?,4096],bf16>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%35598, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %35599 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %35600 = "torch.prims.convert_element_type"(%35598, %35599) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%35600, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %35601 = "torch.aten.div.Tensor"(%35600, %18217) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%35601, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %35602 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %35603 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %35604 = "torch.aten.clamp"(%35601, %35602, %35603) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%35604, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %35605 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %35606 = "torch.prims.convert_element_type"(%35604, %35605) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%35606, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %35607 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %35608 = "torch.aten.unsqueeze"(%18219, %35607) : (!torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>
    %35609 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %35610 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %35611 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %35612 = "torch.prim.ListConstruct"(%35609, %35610, %35611) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %35613 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %35614 = "torch.aten.expand"(%35608, %35612, %35613) : (!torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ>
    %35615 = "torch_c.to_builtin_tensor"(%35606) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %35616 = "torch_c.to_builtin_tensor"(%35614) : (!torch.vtensor<[4,14336,4096],f8E4M3FNUZ>) -> tensor<4x14336x4096xf8E4M3FNUZ>
    %35617 = "util.call"(%35615, %35616) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x?x14336xf32>
    %35618 = "torch_c.from_builtin_tensor"(%35617) : (tensor<4x?x14336xf32>) -> !torch.vtensor<[4,?,14336],f32>
    "torch.bind_symbolic_shape"(%35618, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> ()
    %35619 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %35620 = "torch.prims.convert_element_type"(%35618, %35619) : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%35620, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %35621 = "torch.aten.silu"(%35620) : (!torch.vtensor<[4,?,14336],bf16>) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%35621, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %35622 = "torch.aten.div.Tensor"(%35600, %18221) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%35622, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %35623 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %35624 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %35625 = "torch.aten.clamp"(%35622, %35623, %35624) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%35625, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %35626 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %35627 = "torch.prims.convert_element_type"(%35625, %35626) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%35627, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %35628 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %35629 = "torch.aten.unsqueeze"(%18223, %35628) : (!torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>
    %35630 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %35631 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %35632 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %35633 = "torch.prim.ListConstruct"(%35630, %35631, %35632) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %35634 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %35635 = "torch.aten.expand"(%35629, %35633, %35634) : (!torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ>
    %35636 = "torch_c.to_builtin_tensor"(%35627) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %35637 = "torch_c.to_builtin_tensor"(%35635) : (!torch.vtensor<[4,14336,4096],f8E4M3FNUZ>) -> tensor<4x14336x4096xf8E4M3FNUZ>
    %35638 = "util.call"(%35636, %35637) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x?x14336xf32>
    %35639 = "torch_c.from_builtin_tensor"(%35638) : (tensor<4x?x14336xf32>) -> !torch.vtensor<[4,?,14336],f32>
    "torch.bind_symbolic_shape"(%35639, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> ()
    %35640 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %35641 = "torch.prims.convert_element_type"(%35639, %35640) : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%35641, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %35642 = "torch.aten.mul.Tensor"(%35621, %35641) : (!torch.vtensor<[4,?,14336],bf16>, !torch.vtensor<[4,?,14336],bf16>) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%35642, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %35643 = "torch.aten.div.Tensor"(%35642, %18225) : (!torch.vtensor<[4,?,14336],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%35643, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %35644 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %35645 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %35646 = "torch.aten.clamp"(%35643, %35644, %35645) : (!torch.vtensor<[4,?,14336],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%35646, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %35647 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %35648 = "torch.prims.convert_element_type"(%35646, %35647) : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> !torch.vtensor<[4,?,14336],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%35648, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],f8E4M3FNUZ>, !torch.int) -> ()
    %35649 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %35650 = "torch.aten.unsqueeze"(%18227, %35649) : (!torch.vtensor<[4096,14336],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,14336],f8E4M3FNUZ>
    %35651 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %35652 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %35653 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %35654 = "torch.prim.ListConstruct"(%35651, %35652, %35653) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %35655 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %35656 = "torch.aten.expand"(%35650, %35654, %35655) : (!torch.vtensor<[1,4096,14336],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,14336],f8E4M3FNUZ>
    %35657 = "torch_c.to_builtin_tensor"(%35648) : (!torch.vtensor<[4,?,14336],f8E4M3FNUZ>) -> tensor<4x?x14336xf8E4M3FNUZ>
    %35658 = "torch_c.to_builtin_tensor"(%35656) : (!torch.vtensor<[4,4096,14336],f8E4M3FNUZ>) -> tensor<4x4096x14336xf8E4M3FNUZ>
    %35659 = "util.call"(%35657, %35658) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx14336xf8E4M3FNUZ_R4x4096x14336xf8E4M3FNUZ}> : (tensor<4x?x14336xf8E4M3FNUZ>, tensor<4x4096x14336xf8E4M3FNUZ>) -> tensor<4x?x4096xf32>
    %35660 = "torch_c.from_builtin_tensor"(%35659) : (tensor<4x?x4096xf32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%35660, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %35661 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %35662 = "torch.prims.convert_element_type"(%35660, %35661) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%35662, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %35663 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %35664 = "torch.aten.add.Tensor"(%35581, %35662, %35663) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%35664, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %35665 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %35666 = "torch.prims.convert_element_type"(%35664, %35665) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%35666, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %35667 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %35668 = "torch.aten.pow.Tensor_Scalar"(%35666, %35667) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%35668, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %35669 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %35670 = "torch.prim.ListConstruct"(%35669) : (!torch.int) -> !torch.list<int>
    %35671 = "torch.constant.bool"() <{value = true}> : () -> !torch.bool
    %35672 = "torch.constant.none"() : () -> !torch.none
    %35673 = "torch.aten.mean.dim"(%35668, %35670, %35671, %35672) : (!torch.vtensor<[4,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%35673, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %35674 = "torch.constant.float"() <{value = 1.000000e-05 : f64}> : () -> !torch.float
    %35675 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %35676 = "torch.aten.add.Scalar"(%35673, %35674, %35675) : (!torch.vtensor<[4,?,1],f32>, !torch.float, !torch.int) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%35676, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %35677 = "torch.aten.rsqrt"(%35676) : (!torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%35677, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %35678 = "torch.aten.mul.Tensor"(%35666, %35677) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%35678, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %35679 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %35680 = "torch.prims.convert_element_type"(%35678, %35679) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%35680, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %35681 = "torch.aten.mul.Tensor"(%18229, %35680) : (!torch.vtensor<[4096],bf16>, !torch.vtensor<[4,?,4096],bf16>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%35681, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %35682 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %35683 = "torch.prims.convert_element_type"(%35681, %35682) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%35683, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %35684 = "torch.aten.div.Tensor"(%35683, %18231) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%35684, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %35685 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %35686 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %35687 = "torch.aten.clamp"(%35684, %35685, %35686) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%35687, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %35688 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %35689 = "torch.prims.convert_element_type"(%35687, %35688) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%35689, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %35690 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %35691 = "torch.aten.unsqueeze"(%18233, %35690) : (!torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>
    %35692 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %35693 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %35694 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %35695 = "torch.prim.ListConstruct"(%35692, %35693, %35694) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %35696 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %35697 = "torch.aten.expand"(%35691, %35695, %35696) : (!torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ>
    %35698 = "torch_c.to_builtin_tensor"(%35689) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %35699 = "torch_c.to_builtin_tensor"(%35697) : (!torch.vtensor<[4,4096,4096],f8E4M3FNUZ>) -> tensor<4x4096x4096xf8E4M3FNUZ>
    %35700 = "util.call"(%35698, %35699) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x?x4096xf32>
    %35701 = "torch_c.from_builtin_tensor"(%35700) : (tensor<4x?x4096xf32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%35701, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %35702 = "torch.aten.div.Tensor"(%35701, %18235) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%35702, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %35703 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %35704 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %35705 = "torch.aten.clamp"(%35702, %35703, %35704) : (!torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%35705, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %35706 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %35707 = "torch.prims.convert_element_type"(%35705, %35706) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%35707, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %35708 = "torch.aten.div.Tensor"(%35683, %18237) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%35708, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %35709 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %35710 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %35711 = "torch.aten.clamp"(%35708, %35709, %35710) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%35711, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %35712 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %35713 = "torch.prims.convert_element_type"(%35711, %35712) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%35713, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %35714 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %35715 = "torch.aten.unsqueeze"(%18239, %35714) : (!torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>
    %35716 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %35717 = "torch.constant.int"() <{value = 1024 : i64}> : () -> !torch.int
    %35718 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %35719 = "torch.prim.ListConstruct"(%35716, %35717, %35718) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %35720 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %35721 = "torch.aten.expand"(%35715, %35719, %35720) : (!torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ>
    %35722 = "torch_c.to_builtin_tensor"(%35713) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %35723 = "torch_c.to_builtin_tensor"(%35721) : (!torch.vtensor<[4,1024,4096],f8E4M3FNUZ>) -> tensor<4x1024x4096xf8E4M3FNUZ>
    %35724 = "util.call"(%35722, %35723) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x?x1024xf32>
    %35725 = "torch_c.from_builtin_tensor"(%35724) : (tensor<4x?x1024xf32>) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%35725, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %35726 = "torch.aten.div.Tensor"(%35725, %18241) : (!torch.vtensor<[4,?,1024],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%35726, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %35727 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %35728 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %35729 = "torch.aten.clamp"(%35726, %35727, %35728) : (!torch.vtensor<[4,?,1024],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%35729, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %35730 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %35731 = "torch.prims.convert_element_type"(%35729, %35730) : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> !torch.vtensor<[4,?,1024],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%35731, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.int) -> ()
    %35732 = "torch.aten.div.Tensor"(%35683, %18243) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%35732, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %35733 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %35734 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %35735 = "torch.aten.clamp"(%35732, %35733, %35734) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%35735, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %35736 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %35737 = "torch.prims.convert_element_type"(%35735, %35736) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%35737, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %35738 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %35739 = "torch.aten.unsqueeze"(%18245, %35738) : (!torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>
    %35740 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %35741 = "torch.constant.int"() <{value = 1024 : i64}> : () -> !torch.int
    %35742 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %35743 = "torch.prim.ListConstruct"(%35740, %35741, %35742) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %35744 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %35745 = "torch.aten.expand"(%35739, %35743, %35744) : (!torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ>
    %35746 = "torch_c.to_builtin_tensor"(%35737) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %35747 = "torch_c.to_builtin_tensor"(%35745) : (!torch.vtensor<[4,1024,4096],f8E4M3FNUZ>) -> tensor<4x1024x4096xf8E4M3FNUZ>
    %35748 = "util.call"(%35746, %35747) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x?x1024xf32>
    %35749 = "torch_c.from_builtin_tensor"(%35748) : (tensor<4x?x1024xf32>) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%35749, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %35750 = "torch.aten.div.Tensor"(%35749, %18247) : (!torch.vtensor<[4,?,1024],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%35750, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %35751 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %35752 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %35753 = "torch.aten.clamp"(%35750, %35751, %35752) : (!torch.vtensor<[4,?,1024],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%35753, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %35754 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %35755 = "torch.prims.convert_element_type"(%35753, %35754) : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> !torch.vtensor<[4,?,1024],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%35755, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.int) -> ()
    %35756 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %35757 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %35758 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %35759 = "torch.prim.ListConstruct"(%35756, %18481, %35757, %35758) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %35760 = "torch.aten.view"(%35707, %35759) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%35760, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %35761 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %35762 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %35763 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %35764 = "torch.prim.ListConstruct"(%35761, %18481, %35762, %35763) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %35765 = "torch.aten.view"(%35731, %35764) : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%35765, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %35766 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %35767 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %35768 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %35769 = "torch.prim.ListConstruct"(%35766, %18481, %35767, %35768) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %35770 = "torch.aten.view"(%35755, %35769) : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%35770, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %35771 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
    %35772 = "torch.constant.none"() : () -> !torch.none
    %35773 = "torch.constant.none"() : () -> !torch.none
    %35774 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %35775 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %35776 = "torch.aten.arange"(%35771, %35772, %35773, %35774, %35775) : (!torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[131072],si64>
    %35777 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %35778 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %35779 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %35780 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %35781 = "torch.constant.none"() : () -> !torch.none
    %35782 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %35783 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %35784 = "torch.aten.arange.start_step"(%35777, %35778, %35779, %35780, %35781, %35782, %35783) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[64],si64>
    %35785 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %35786 = "torch.prims.convert_element_type"(%35784, %35785) : (!torch.vtensor<[64],si64>, !torch.int) -> !torch.vtensor<[64],f32>
    %35787 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %35788 = "torch.aten.div.Scalar"(%35786, %35787) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %35789 = "torch.constant.float"() <{value = 5.000000e+05 : f64}> : () -> !torch.float
    %35790 = "torch.aten.pow.Scalar"(%35789, %35788) : (!torch.float, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %35791 = "torch.aten.reciprocal"(%35790) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %35792 = "torch.constant.float"() <{value = 1.000000e+00 : f64}> : () -> !torch.float
    %35793 = "torch.aten.mul.Scalar"(%35791, %35792) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
    %35794 = "torch.aten.reciprocal"(%35793) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %35795 = "torch.constant.float"() <{value = 6.2831853071795862 : f64}> : () -> !torch.float
    %35796 = "torch.aten.mul.Scalar"(%35794, %35795) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
    %35797 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
    %35798 = "torch.aten.gt.Scalar"(%35796, %35797) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %35799 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %35800 = "torch.aten.div.Scalar"(%35793, %35799) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %35801 = "torch.aten.where.self"(%35798, %35800, %35793) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %35802 = "torch.aten.reciprocal"(%35796) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %35803 = "torch.constant.int"() <{value = 8192 : i64}> : () -> !torch.int
    %35804 = "torch.aten.mul.Scalar"(%35802, %35803) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %35805 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %35806 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %35807 = "torch.aten.sub.Scalar"(%35804, %35805, %35806) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
    %35808 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %35809 = "torch.aten.div.Scalar"(%35807, %35808) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %35810 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %35811 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %35812 = "torch.aten.rsub.Scalar"(%35809, %35810, %35811) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
    %35813 = "torch.aten.mul.Tensor"(%35812, %35801) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %35814 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %35815 = "torch.aten.div.Scalar"(%35813, %35814) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %35816 = "torch.aten.mul.Tensor"(%35809, %35801) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %35817 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %35818 = "torch.aten.add.Tensor"(%35815, %35816, %35817) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %35819 = "torch.constant.float"() <{value = 2.048000e+03 : f64}> : () -> !torch.float
    %35820 = "torch.aten.lt.Scalar"(%35796, %35819) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %35821 = "torch.aten.bitwise_not"(%35820) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %35822 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
    %35823 = "torch.aten.gt.Scalar"(%35796, %35822) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %35824 = "torch.aten.bitwise_not"(%35823) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %35825 = "torch.aten.mul.Tensor"(%35821, %35824) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %35826 = "torch.aten.where.self"(%35825, %35818, %35801) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %35827 = "torch.prim.ListConstruct"(%35826, %35826) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.list<vtensor>
    %35828 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %35829 = "torch.aten.cat"(%35827, %35828) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[128],f32>
    %35830 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %35831 = "torch.prims.convert_element_type"(%35776, %35830) : (!torch.vtensor<[131072],si64>, !torch.int) -> !torch.vtensor<[131072],f32>
    %35832 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %35833 = "torch.prims.convert_element_type"(%35829, %35832) : (!torch.vtensor<[128],f32>, !torch.int) -> !torch.vtensor<[128],f32>
    %35834 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
    %35835 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %35836 = "torch.prim.ListConstruct"(%35834, %35835) : (!torch.int, !torch.int) -> !torch.list<int>
    %35837 = "torch.aten.view"(%35831, %35836) : (!torch.vtensor<[131072],f32>, !torch.list<int>) -> !torch.vtensor<[131072,1],f32>
    %35838 = "torch.aten.mul.Tensor"(%35837, %35833) : (!torch.vtensor<[131072,1],f32>, !torch.vtensor<[128],f32>) -> !torch.vtensor<[131072,128],f32>
    %35839 = "torch.aten.cos"(%35838) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
    %35840 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %35841 = "torch.prims.convert_element_type"(%35839, %35840) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
    %35842 = "torch.aten.sin"(%35838) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
    %35843 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %35844 = "torch.prims.convert_element_type"(%35842, %35843) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
    %35845 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %35846 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %35847 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %35848 = "torch.aten.slice.Tensor"(%35841, %35845, %35846, %18481, %35847) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%35848, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %35849 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %35850 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %35851 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %35852 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %35853 = "torch.aten.slice.Tensor"(%35848, %35849, %35850, %35851, %35852) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%35853, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %35854 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %35855 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %35856 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %35857 = "torch.aten.slice.Tensor"(%35844, %35854, %35855, %18481, %35856) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%35857, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %35858 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %35859 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %35860 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %35861 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %35862 = "torch.aten.slice.Tensor"(%35857, %35858, %35859, %35860, %35861) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%35862, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %35863 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %35864 = "torch.aten.unsqueeze"(%35853, %35863) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%35864, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %35865 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %35866 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %35867 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %35868 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %35869 = "torch.aten.slice.Tensor"(%35864, %35865, %35866, %35867, %35868) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%35869, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %35870 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %35871 = "torch.aten.unsqueeze"(%35869, %35870) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%35871, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %35872 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %35873 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %35874 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %35875 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %35876 = "torch.aten.slice.Tensor"(%35871, %35872, %35873, %35874, %35875) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%35876, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %35877 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %35878 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %35879 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %35880 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %35881 = "torch.prim.ListConstruct"(%35877, %35878, %35879, %35880) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %35882 = "torch.aten.repeat"(%35876, %35881) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%35882, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
    %35883 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %35884 = "torch.aten.unsqueeze"(%35862, %35883) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%35884, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %35885 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %35886 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %35887 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %35888 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %35889 = "torch.aten.slice.Tensor"(%35884, %35885, %35886, %35887, %35888) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%35889, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %35890 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %35891 = "torch.aten.unsqueeze"(%35889, %35890) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%35891, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %35892 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %35893 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %35894 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %35895 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %35896 = "torch.aten.slice.Tensor"(%35891, %35892, %35893, %35894, %35895) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%35896, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %35897 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %35898 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %35899 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %35900 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %35901 = "torch.prim.ListConstruct"(%35897, %35898, %35899, %35900) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %35902 = "torch.aten.repeat"(%35896, %35901) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%35902, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
    %35903 = "torch.aten.mul.Tensor"(%35760, %35882) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%35903, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %35904 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %35905 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %35906 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %35907 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %35908 = "torch.aten.slice.Tensor"(%35760, %35904, %35905, %35906, %35907) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%35908, %18474) <{shape_expressions = #map18}> : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.int) -> ()
    %35909 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %35910 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %35911 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %35912 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %35913 = "torch.aten.slice.Tensor"(%35760, %35909, %35910, %35911, %35912) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%35913, %18474) <{shape_expressions = #map18}> : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.int) -> ()
    %35914 = "torch.aten.neg"(%35913) : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>) -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%35914, %18474) <{shape_expressions = #map18}> : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.int) -> ()
    %35915 = "torch.prim.ListConstruct"(%35914, %35908) : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>) -> !torch.list<vtensor>
    %35916 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %35917 = "torch.aten.cat"(%35915, %35916) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%35917, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %35918 = "torch.aten.mul.Tensor"(%35917, %35902) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%35918, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %35919 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %35920 = "torch.aten.add.Tensor"(%35903, %35918, %35919) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%35920, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %35921 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
    %35922 = "torch.constant.none"() : () -> !torch.none
    %35923 = "torch.constant.none"() : () -> !torch.none
    %35924 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %35925 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %35926 = "torch.aten.arange"(%35921, %35922, %35923, %35924, %35925) : (!torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[131072],si64>
    %35927 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %35928 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %35929 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %35930 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %35931 = "torch.constant.none"() : () -> !torch.none
    %35932 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %35933 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %35934 = "torch.aten.arange.start_step"(%35927, %35928, %35929, %35930, %35931, %35932, %35933) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[64],si64>
    %35935 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %35936 = "torch.prims.convert_element_type"(%35934, %35935) : (!torch.vtensor<[64],si64>, !torch.int) -> !torch.vtensor<[64],f32>
    %35937 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %35938 = "torch.aten.div.Scalar"(%35936, %35937) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %35939 = "torch.constant.float"() <{value = 5.000000e+05 : f64}> : () -> !torch.float
    %35940 = "torch.aten.pow.Scalar"(%35939, %35938) : (!torch.float, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %35941 = "torch.aten.reciprocal"(%35940) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %35942 = "torch.constant.float"() <{value = 1.000000e+00 : f64}> : () -> !torch.float
    %35943 = "torch.aten.mul.Scalar"(%35941, %35942) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
    %35944 = "torch.aten.reciprocal"(%35943) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %35945 = "torch.constant.float"() <{value = 6.2831853071795862 : f64}> : () -> !torch.float
    %35946 = "torch.aten.mul.Scalar"(%35944, %35945) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
    %35947 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
    %35948 = "torch.aten.gt.Scalar"(%35946, %35947) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %35949 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %35950 = "torch.aten.div.Scalar"(%35943, %35949) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %35951 = "torch.aten.where.self"(%35948, %35950, %35943) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %35952 = "torch.aten.reciprocal"(%35946) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %35953 = "torch.constant.int"() <{value = 8192 : i64}> : () -> !torch.int
    %35954 = "torch.aten.mul.Scalar"(%35952, %35953) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %35955 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %35956 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %35957 = "torch.aten.sub.Scalar"(%35954, %35955, %35956) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
    %35958 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %35959 = "torch.aten.div.Scalar"(%35957, %35958) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %35960 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %35961 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %35962 = "torch.aten.rsub.Scalar"(%35959, %35960, %35961) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
    %35963 = "torch.aten.mul.Tensor"(%35962, %35951) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %35964 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %35965 = "torch.aten.div.Scalar"(%35963, %35964) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %35966 = "torch.aten.mul.Tensor"(%35959, %35951) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %35967 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %35968 = "torch.aten.add.Tensor"(%35965, %35966, %35967) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %35969 = "torch.constant.float"() <{value = 2.048000e+03 : f64}> : () -> !torch.float
    %35970 = "torch.aten.lt.Scalar"(%35946, %35969) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %35971 = "torch.aten.bitwise_not"(%35970) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %35972 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
    %35973 = "torch.aten.gt.Scalar"(%35946, %35972) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %35974 = "torch.aten.bitwise_not"(%35973) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %35975 = "torch.aten.mul.Tensor"(%35971, %35974) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %35976 = "torch.aten.where.self"(%35975, %35968, %35951) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %35977 = "torch.prim.ListConstruct"(%35976, %35976) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.list<vtensor>
    %35978 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %35979 = "torch.aten.cat"(%35977, %35978) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[128],f32>
    %35980 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %35981 = "torch.prims.convert_element_type"(%35926, %35980) : (!torch.vtensor<[131072],si64>, !torch.int) -> !torch.vtensor<[131072],f32>
    %35982 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %35983 = "torch.prims.convert_element_type"(%35979, %35982) : (!torch.vtensor<[128],f32>, !torch.int) -> !torch.vtensor<[128],f32>
    %35984 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
    %35985 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %35986 = "torch.prim.ListConstruct"(%35984, %35985) : (!torch.int, !torch.int) -> !torch.list<int>
    %35987 = "torch.aten.view"(%35981, %35986) : (!torch.vtensor<[131072],f32>, !torch.list<int>) -> !torch.vtensor<[131072,1],f32>
    %35988 = "torch.aten.mul.Tensor"(%35987, %35983) : (!torch.vtensor<[131072,1],f32>, !torch.vtensor<[128],f32>) -> !torch.vtensor<[131072,128],f32>
    %35989 = "torch.aten.cos"(%35988) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
    %35990 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %35991 = "torch.prims.convert_element_type"(%35989, %35990) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
    %35992 = "torch.aten.sin"(%35988) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
    %35993 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %35994 = "torch.prims.convert_element_type"(%35992, %35993) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
    %35995 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %35996 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %35997 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %35998 = "torch.aten.slice.Tensor"(%35991, %35995, %35996, %18481, %35997) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%35998, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %35999 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %36000 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %36001 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %36002 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %36003 = "torch.aten.slice.Tensor"(%35998, %35999, %36000, %36001, %36002) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%36003, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %36004 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %36005 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %36006 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %36007 = "torch.aten.slice.Tensor"(%35994, %36004, %36005, %18481, %36006) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%36007, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %36008 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %36009 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %36010 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %36011 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %36012 = "torch.aten.slice.Tensor"(%36007, %36008, %36009, %36010, %36011) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%36012, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %36013 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %36014 = "torch.aten.unsqueeze"(%36003, %36013) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%36014, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %36015 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %36016 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %36017 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %36018 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %36019 = "torch.aten.slice.Tensor"(%36014, %36015, %36016, %36017, %36018) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%36019, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %36020 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %36021 = "torch.aten.unsqueeze"(%36019, %36020) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%36021, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %36022 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %36023 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %36024 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %36025 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %36026 = "torch.aten.slice.Tensor"(%36021, %36022, %36023, %36024, %36025) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%36026, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %36027 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %36028 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %36029 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %36030 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %36031 = "torch.prim.ListConstruct"(%36027, %36028, %36029, %36030) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %36032 = "torch.aten.repeat"(%36026, %36031) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%36032, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
    %36033 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %36034 = "torch.aten.unsqueeze"(%36012, %36033) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%36034, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %36035 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %36036 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %36037 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %36038 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %36039 = "torch.aten.slice.Tensor"(%36034, %36035, %36036, %36037, %36038) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%36039, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %36040 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %36041 = "torch.aten.unsqueeze"(%36039, %36040) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%36041, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %36042 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %36043 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %36044 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %36045 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %36046 = "torch.aten.slice.Tensor"(%36041, %36042, %36043, %36044, %36045) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%36046, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %36047 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %36048 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %36049 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %36050 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %36051 = "torch.prim.ListConstruct"(%36047, %36048, %36049, %36050) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %36052 = "torch.aten.repeat"(%36046, %36051) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%36052, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
    %36053 = "torch.aten.mul.Tensor"(%35765, %36032) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%36053, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %36054 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %36055 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %36056 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %36057 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %36058 = "torch.aten.slice.Tensor"(%35765, %36054, %36055, %36056, %36057) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%36058, %18474) <{shape_expressions = #map19}> : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.int) -> ()
    %36059 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %36060 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %36061 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %36062 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %36063 = "torch.aten.slice.Tensor"(%35765, %36059, %36060, %36061, %36062) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%36063, %18474) <{shape_expressions = #map19}> : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.int) -> ()
    %36064 = "torch.aten.neg"(%36063) : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>) -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%36064, %18474) <{shape_expressions = #map19}> : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.int) -> ()
    %36065 = "torch.prim.ListConstruct"(%36064, %36058) : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>) -> !torch.list<vtensor>
    %36066 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %36067 = "torch.aten.cat"(%36065, %36066) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%36067, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %36068 = "torch.aten.mul.Tensor"(%36067, %36052) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%36068, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %36069 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %36070 = "torch.aten.add.Tensor"(%36053, %36068, %36069) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%36070, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %36071 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %36072 = "torch.aten.mul.Scalar"(%arg69, %36071) : (!torch.vtensor<[4,?],si64>, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%36072, %18474) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %36073 = "torch.constant.int"() <{value = 52 : i64}> : () -> !torch.int
    %36074 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %36075 = "torch.aten.add.Scalar"(%36072, %36073, %36074) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%36075, %18474) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %36076 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %36077 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %36078 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %36079 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %36080 = "torch.prim.ListConstruct"(%36076, %18477, %36077, %36078, %36079) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %36081 = "torch.aten.view"(%36070, %36080) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%36081, %18474) <{shape_expressions = #map22}> : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %36082 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %36083 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %36084 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %36085 = "torch.prim.ListConstruct"(%19011, %36082, %36083, %36084) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %36086 = "torch.aten.view"(%36081, %36085) : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%36086, %18474) <{shape_expressions = #map23}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %36087 = "torch.prim.ListConstruct"(%19011) : (!torch.int) -> !torch.list<int>
    %36088 = "torch.aten.view"(%36075, %36087) : (!torch.vtensor<[4,?],si64>, !torch.list<int>) -> !torch.vtensor<[?],si64>
    "torch.bind_symbolic_shape"(%36088, %18474) <{shape_expressions = #map24}> : (!torch.vtensor<[?],si64>, !torch.int) -> ()
    %36089 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %36090 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %36091 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %36092 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %36093 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %36094 = "torch.prim.ListConstruct"(%18479, %36089, %36090, %36091, %36092, %36093) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %36095 = "torch.aten.view"(%35497, %36094) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%36095, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %36096 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %36097 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %36098 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %36099 = "torch.prim.ListConstruct"(%18993, %36096, %36097, %36098) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %36100 = "torch.aten.view"(%36095, %36099) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%36100, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %36101 = "torch.prim.ListConstruct"(%36088) : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>>
    %36102 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %36103 = "torch.aten.index_put"(%36100, %36101, %36086, %36102) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.bool) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%36103, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %36104 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %36105 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %36106 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %36107 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %36108 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %36109 = "torch.prim.ListConstruct"(%18479, %36104, %36105, %36106, %36107, %36108) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %36110 = "torch.aten.view"(%36103, %36109) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%36110, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %36111 = "torch.constant.int"() <{value = 2097152 : i64}> : () -> !torch.int
    %36112 = "torch.prim.ListConstruct"(%18479, %36111) : (!torch.int, !torch.int) -> !torch.list<int>
    %36113 = "torch.aten.view"(%36110, %36112) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,2097152],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%36113, %18475) <{shape_expressions = #map2}> : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> ()
    %36114 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %36115 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %36116 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %36117 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %36118 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %36119 = "torch.prim.ListConstruct"(%18479, %36114, %36115, %36116, %36117, %36118) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %36120 = "torch.aten.view"(%36113, %36119) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%36120, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %36121 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %36122 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %36123 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %36124 = "torch.prim.ListConstruct"(%18993, %36121, %36122, %36123) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %36125 = "torch.aten.view"(%36120, %36124) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%36125, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %36126 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %36127 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %36128 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %36129 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %36130 = "torch.prim.ListConstruct"(%36126, %18477, %36127, %36128, %36129) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %36131 = "torch.aten.view"(%35770, %36130) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%36131, %18474) <{shape_expressions = #map22}> : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %36132 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %36133 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %36134 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %36135 = "torch.prim.ListConstruct"(%19011, %36132, %36133, %36134) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %36136 = "torch.aten.view"(%36131, %36135) : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%36136, %18474) <{shape_expressions = #map23}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %36137 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %36138 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %36139 = "torch.aten.add.Scalar"(%36075, %36137, %36138) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%36139, %18474) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %36140 = "torch.prim.ListConstruct"(%19011) : (!torch.int) -> !torch.list<int>
    %36141 = "torch.aten.view"(%36139, %36140) : (!torch.vtensor<[4,?],si64>, !torch.list<int>) -> !torch.vtensor<[?],si64>
    "torch.bind_symbolic_shape"(%36141, %18474) <{shape_expressions = #map24}> : (!torch.vtensor<[?],si64>, !torch.int) -> ()
    %36142 = "torch.prim.ListConstruct"(%36141) : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>>
    %36143 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %36144 = "torch.aten.index_put"(%36125, %36142, %36136, %36143) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.bool) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%36144, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %36145 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %36146 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %36147 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %36148 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %36149 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %36150 = "torch.prim.ListConstruct"(%18479, %36145, %36146, %36147, %36148, %36149) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %36151 = "torch.aten.view"(%36144, %36150) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%36151, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %36152 = "torch.constant.int"() <{value = 2097152 : i64}> : () -> !torch.int
    %36153 = "torch.prim.ListConstruct"(%18479, %36152) : (!torch.int, !torch.int) -> !torch.list<int>
    %36154 = "torch.aten.view"(%36151, %36153) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,2097152],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%36154, %18475) <{shape_expressions = #map2}> : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> ()
    %36155 = "torch.constant.int"() <{value = -2 : i64}> : () -> !torch.int
    %36156 = "torch.aten.unsqueeze"(%36070, %36155) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%36156, %18474) <{shape_expressions = #map25}> : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.int) -> ()
    %36157 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %36158 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %36159 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %36160 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %36161 = "torch.prim.ListConstruct"(%36157, %18481, %36158, %36159, %36160) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %36162 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %36163 = "torch.aten.expand"(%36156, %36161, %36162) : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%36163, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %36164 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %36165 = "torch.aten.clone"(%36163, %36164) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%36165, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %36166 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %36167 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %36168 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %36169 = "torch.prim.ListConstruct"(%36166, %18481, %36167, %36168) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %36170 = "torch.aten._unsafe_view"(%36165, %36169) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%36170, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %36171 = "torch.constant.int"() <{value = -2 : i64}> : () -> !torch.int
    %36172 = "torch.aten.unsqueeze"(%35770, %36171) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%36172, %18474) <{shape_expressions = #map25}> : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.int) -> ()
    %36173 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %36174 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %36175 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %36176 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %36177 = "torch.prim.ListConstruct"(%36173, %18481, %36174, %36175, %36176) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %36178 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %36179 = "torch.aten.expand"(%36172, %36177, %36178) : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%36179, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %36180 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %36181 = "torch.aten.clone"(%36179, %36180) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%36181, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %36182 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %36183 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %36184 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %36185 = "torch.prim.ListConstruct"(%36182, %18481, %36183, %36184) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %36186 = "torch.aten._unsafe_view"(%36181, %36185) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%36186, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %36187 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %36188 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %36189 = "torch.aten.transpose.int"(%35920, %36187, %36188) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%36189, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %36190 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %36191 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %36192 = "torch.aten.transpose.int"(%36170, %36190, %36191) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%36192, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %36193 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %36194 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %36195 = "torch.aten.transpose.int"(%36186, %36193, %36194) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%36195, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %36196 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %36197 = "torch.aten.squeeze.dim"(%18570, %36196) : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,1,?,?],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%36197, %18474) <{shape_expressions = #map8}> : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> ()
    %36198 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %36199 = "torch.aten.squeeze.dim"(%36197, %36198) : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,1,?,?],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%36199, %18474) <{shape_expressions = #map8}> : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> ()
    %36200 = "torch_c.to_builtin_tensor"(%36189) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %36201 = "torch_c.to_builtin_tensor"(%36192) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %36202 = "torch_c.to_builtin_tensor"(%36195) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %36203 = "torch_c.to_builtin_tensor"(%36199) : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>) -> tensor<4x1x?x?xf8E4M3FNUZ>
    %36204 = "tensor.cast"(%36203) : (tensor<4x1x?x?xf8E4M3FNUZ>) -> tensor<?x?x?x?xf8E4M3FNUZ>
    %36205 = "torch_c.to_builtin_tensor"(%18249) : (!torch.vtensor<[],f32>) -> tensor<f32>
    %36206 = "util.call"(%36200, %36201, %36202, %36205, %36204) <{callee = @sharktank_masked_flash_attention_4_32_128_128_f8E4M3FNUZ_f32_f32}> : (tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<f32>, tensor<?x?x?x?xf8E4M3FNUZ>) -> tensor<4x32x?x128xf32>
    %36207 = "torch_c.from_builtin_tensor"(%36206) : (tensor<4x32x?x128xf32>) -> !torch.vtensor<[4,32,?,128],f32>
    "torch.bind_symbolic_shape"(%36207, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f32>, !torch.int) -> ()
    %36208 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %36209 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %36210 = "torch.aten.transpose.int"(%36207, %36208, %36209) : (!torch.vtensor<[4,32,?,128],f32>, !torch.int, !torch.int) -> !torch.vtensor<[4,?,32,128],f32>
    "torch.bind_symbolic_shape"(%36210, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f32>, !torch.int) -> ()
    %36211 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %36212 = "torch.aten.clone"(%36210, %36211) : (!torch.vtensor<[4,?,32,128],f32>, !torch.int) -> !torch.vtensor<[4,?,32,128],f32>
    "torch.bind_symbolic_shape"(%36212, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f32>, !torch.int) -> ()
    %36213 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %36214 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %36215 = "torch.prim.ListConstruct"(%36213, %18481, %36214) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %36216 = "torch.aten._unsafe_view"(%36212, %36215) : (!torch.vtensor<[4,?,32,128],f32>, !torch.list<int>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%36216, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %36217 = "torch.aten.div.Tensor"(%36216, %18251) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%36217, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %36218 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %36219 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %36220 = "torch.aten.clamp"(%36217, %36218, %36219) : (!torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%36220, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %36221 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %36222 = "torch.prims.convert_element_type"(%36220, %36221) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%36222, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %36223 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %36224 = "torch.aten.unsqueeze"(%18253, %36223) : (!torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>
    %36225 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %36226 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %36227 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %36228 = "torch.prim.ListConstruct"(%36225, %36226, %36227) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %36229 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %36230 = "torch.aten.expand"(%36224, %36228, %36229) : (!torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ>
    %36231 = "torch_c.to_builtin_tensor"(%36222) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %36232 = "torch_c.to_builtin_tensor"(%36230) : (!torch.vtensor<[4,4096,4096],f8E4M3FNUZ>) -> tensor<4x4096x4096xf8E4M3FNUZ>
    %36233 = "util.call"(%36231, %36232) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x?x4096xf32>
    %36234 = "torch_c.from_builtin_tensor"(%36233) : (tensor<4x?x4096xf32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%36234, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %36235 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %36236 = "torch.prims.convert_element_type"(%36234, %36235) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%36236, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %36237 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %36238 = "torch.aten.add.Tensor"(%35664, %36236, %36237) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%36238, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %36239 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %36240 = "torch.prims.convert_element_type"(%36238, %36239) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%36240, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %36241 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %36242 = "torch.aten.pow.Tensor_Scalar"(%36240, %36241) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%36242, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %36243 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %36244 = "torch.prim.ListConstruct"(%36243) : (!torch.int) -> !torch.list<int>
    %36245 = "torch.constant.bool"() <{value = true}> : () -> !torch.bool
    %36246 = "torch.constant.none"() : () -> !torch.none
    %36247 = "torch.aten.mean.dim"(%36242, %36244, %36245, %36246) : (!torch.vtensor<[4,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%36247, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %36248 = "torch.constant.float"() <{value = 1.000000e-05 : f64}> : () -> !torch.float
    %36249 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %36250 = "torch.aten.add.Scalar"(%36247, %36248, %36249) : (!torch.vtensor<[4,?,1],f32>, !torch.float, !torch.int) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%36250, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %36251 = "torch.aten.rsqrt"(%36250) : (!torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%36251, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %36252 = "torch.aten.mul.Tensor"(%36240, %36251) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%36252, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %36253 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %36254 = "torch.prims.convert_element_type"(%36252, %36253) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%36254, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %36255 = "torch.aten.mul.Tensor"(%18255, %36254) : (!torch.vtensor<[4096],bf16>, !torch.vtensor<[4,?,4096],bf16>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%36255, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %36256 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %36257 = "torch.prims.convert_element_type"(%36255, %36256) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%36257, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %36258 = "torch.aten.div.Tensor"(%36257, %18257) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%36258, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %36259 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %36260 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %36261 = "torch.aten.clamp"(%36258, %36259, %36260) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%36261, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %36262 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %36263 = "torch.prims.convert_element_type"(%36261, %36262) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%36263, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %36264 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %36265 = "torch.aten.unsqueeze"(%18259, %36264) : (!torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>
    %36266 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %36267 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %36268 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %36269 = "torch.prim.ListConstruct"(%36266, %36267, %36268) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %36270 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %36271 = "torch.aten.expand"(%36265, %36269, %36270) : (!torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ>
    %36272 = "torch_c.to_builtin_tensor"(%36263) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %36273 = "torch_c.to_builtin_tensor"(%36271) : (!torch.vtensor<[4,14336,4096],f8E4M3FNUZ>) -> tensor<4x14336x4096xf8E4M3FNUZ>
    %36274 = "util.call"(%36272, %36273) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x?x14336xf32>
    %36275 = "torch_c.from_builtin_tensor"(%36274) : (tensor<4x?x14336xf32>) -> !torch.vtensor<[4,?,14336],f32>
    "torch.bind_symbolic_shape"(%36275, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> ()
    %36276 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %36277 = "torch.prims.convert_element_type"(%36275, %36276) : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%36277, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %36278 = "torch.aten.silu"(%36277) : (!torch.vtensor<[4,?,14336],bf16>) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%36278, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %36279 = "torch.aten.div.Tensor"(%36257, %18261) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%36279, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %36280 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %36281 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %36282 = "torch.aten.clamp"(%36279, %36280, %36281) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%36282, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %36283 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %36284 = "torch.prims.convert_element_type"(%36282, %36283) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%36284, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %36285 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %36286 = "torch.aten.unsqueeze"(%18263, %36285) : (!torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>
    %36287 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %36288 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %36289 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %36290 = "torch.prim.ListConstruct"(%36287, %36288, %36289) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %36291 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %36292 = "torch.aten.expand"(%36286, %36290, %36291) : (!torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ>
    %36293 = "torch_c.to_builtin_tensor"(%36284) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %36294 = "torch_c.to_builtin_tensor"(%36292) : (!torch.vtensor<[4,14336,4096],f8E4M3FNUZ>) -> tensor<4x14336x4096xf8E4M3FNUZ>
    %36295 = "util.call"(%36293, %36294) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x?x14336xf32>
    %36296 = "torch_c.from_builtin_tensor"(%36295) : (tensor<4x?x14336xf32>) -> !torch.vtensor<[4,?,14336],f32>
    "torch.bind_symbolic_shape"(%36296, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> ()
    %36297 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %36298 = "torch.prims.convert_element_type"(%36296, %36297) : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%36298, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %36299 = "torch.aten.mul.Tensor"(%36278, %36298) : (!torch.vtensor<[4,?,14336],bf16>, !torch.vtensor<[4,?,14336],bf16>) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%36299, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %36300 = "torch.aten.div.Tensor"(%36299, %18265) : (!torch.vtensor<[4,?,14336],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%36300, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %36301 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %36302 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %36303 = "torch.aten.clamp"(%36300, %36301, %36302) : (!torch.vtensor<[4,?,14336],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%36303, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %36304 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %36305 = "torch.prims.convert_element_type"(%36303, %36304) : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> !torch.vtensor<[4,?,14336],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%36305, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],f8E4M3FNUZ>, !torch.int) -> ()
    %36306 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %36307 = "torch.aten.unsqueeze"(%18267, %36306) : (!torch.vtensor<[4096,14336],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,14336],f8E4M3FNUZ>
    %36308 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %36309 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %36310 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %36311 = "torch.prim.ListConstruct"(%36308, %36309, %36310) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %36312 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %36313 = "torch.aten.expand"(%36307, %36311, %36312) : (!torch.vtensor<[1,4096,14336],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,14336],f8E4M3FNUZ>
    %36314 = "torch_c.to_builtin_tensor"(%36305) : (!torch.vtensor<[4,?,14336],f8E4M3FNUZ>) -> tensor<4x?x14336xf8E4M3FNUZ>
    %36315 = "torch_c.to_builtin_tensor"(%36313) : (!torch.vtensor<[4,4096,14336],f8E4M3FNUZ>) -> tensor<4x4096x14336xf8E4M3FNUZ>
    %36316 = "util.call"(%36314, %36315) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx14336xf8E4M3FNUZ_R4x4096x14336xf8E4M3FNUZ}> : (tensor<4x?x14336xf8E4M3FNUZ>, tensor<4x4096x14336xf8E4M3FNUZ>) -> tensor<4x?x4096xf32>
    %36317 = "torch_c.from_builtin_tensor"(%36316) : (tensor<4x?x4096xf32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%36317, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %36318 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %36319 = "torch.prims.convert_element_type"(%36317, %36318) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%36319, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %36320 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %36321 = "torch.aten.add.Tensor"(%36238, %36319, %36320) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%36321, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %36322 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %36323 = "torch.prims.convert_element_type"(%36321, %36322) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%36323, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %36324 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %36325 = "torch.aten.pow.Tensor_Scalar"(%36323, %36324) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%36325, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %36326 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %36327 = "torch.prim.ListConstruct"(%36326) : (!torch.int) -> !torch.list<int>
    %36328 = "torch.constant.bool"() <{value = true}> : () -> !torch.bool
    %36329 = "torch.constant.none"() : () -> !torch.none
    %36330 = "torch.aten.mean.dim"(%36325, %36327, %36328, %36329) : (!torch.vtensor<[4,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%36330, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %36331 = "torch.constant.float"() <{value = 1.000000e-05 : f64}> : () -> !torch.float
    %36332 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %36333 = "torch.aten.add.Scalar"(%36330, %36331, %36332) : (!torch.vtensor<[4,?,1],f32>, !torch.float, !torch.int) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%36333, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %36334 = "torch.aten.rsqrt"(%36333) : (!torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%36334, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %36335 = "torch.aten.mul.Tensor"(%36323, %36334) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%36335, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %36336 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %36337 = "torch.prims.convert_element_type"(%36335, %36336) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%36337, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %36338 = "torch.aten.mul.Tensor"(%18269, %36337) : (!torch.vtensor<[4096],bf16>, !torch.vtensor<[4,?,4096],bf16>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%36338, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %36339 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %36340 = "torch.prims.convert_element_type"(%36338, %36339) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%36340, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %36341 = "torch.aten.div.Tensor"(%36340, %18271) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%36341, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %36342 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %36343 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %36344 = "torch.aten.clamp"(%36341, %36342, %36343) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%36344, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %36345 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %36346 = "torch.prims.convert_element_type"(%36344, %36345) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%36346, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %36347 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %36348 = "torch.aten.unsqueeze"(%18273, %36347) : (!torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>
    %36349 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %36350 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %36351 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %36352 = "torch.prim.ListConstruct"(%36349, %36350, %36351) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %36353 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %36354 = "torch.aten.expand"(%36348, %36352, %36353) : (!torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ>
    %36355 = "torch_c.to_builtin_tensor"(%36346) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %36356 = "torch_c.to_builtin_tensor"(%36354) : (!torch.vtensor<[4,4096,4096],f8E4M3FNUZ>) -> tensor<4x4096x4096xf8E4M3FNUZ>
    %36357 = "util.call"(%36355, %36356) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x?x4096xf32>
    %36358 = "torch_c.from_builtin_tensor"(%36357) : (tensor<4x?x4096xf32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%36358, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %36359 = "torch.aten.div.Tensor"(%36358, %18275) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%36359, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %36360 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %36361 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %36362 = "torch.aten.clamp"(%36359, %36360, %36361) : (!torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%36362, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %36363 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %36364 = "torch.prims.convert_element_type"(%36362, %36363) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%36364, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %36365 = "torch.aten.div.Tensor"(%36340, %18277) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%36365, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %36366 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %36367 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %36368 = "torch.aten.clamp"(%36365, %36366, %36367) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%36368, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %36369 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %36370 = "torch.prims.convert_element_type"(%36368, %36369) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%36370, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %36371 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %36372 = "torch.aten.unsqueeze"(%18279, %36371) : (!torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>
    %36373 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %36374 = "torch.constant.int"() <{value = 1024 : i64}> : () -> !torch.int
    %36375 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %36376 = "torch.prim.ListConstruct"(%36373, %36374, %36375) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %36377 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %36378 = "torch.aten.expand"(%36372, %36376, %36377) : (!torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ>
    %36379 = "torch_c.to_builtin_tensor"(%36370) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %36380 = "torch_c.to_builtin_tensor"(%36378) : (!torch.vtensor<[4,1024,4096],f8E4M3FNUZ>) -> tensor<4x1024x4096xf8E4M3FNUZ>
    %36381 = "util.call"(%36379, %36380) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x?x1024xf32>
    %36382 = "torch_c.from_builtin_tensor"(%36381) : (tensor<4x?x1024xf32>) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%36382, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %36383 = "torch.aten.div.Tensor"(%36382, %18281) : (!torch.vtensor<[4,?,1024],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%36383, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %36384 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %36385 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %36386 = "torch.aten.clamp"(%36383, %36384, %36385) : (!torch.vtensor<[4,?,1024],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%36386, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %36387 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %36388 = "torch.prims.convert_element_type"(%36386, %36387) : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> !torch.vtensor<[4,?,1024],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%36388, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.int) -> ()
    %36389 = "torch.aten.div.Tensor"(%36340, %18283) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%36389, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %36390 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %36391 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %36392 = "torch.aten.clamp"(%36389, %36390, %36391) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%36392, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %36393 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %36394 = "torch.prims.convert_element_type"(%36392, %36393) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%36394, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %36395 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %36396 = "torch.aten.unsqueeze"(%18285, %36395) : (!torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>
    %36397 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %36398 = "torch.constant.int"() <{value = 1024 : i64}> : () -> !torch.int
    %36399 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %36400 = "torch.prim.ListConstruct"(%36397, %36398, %36399) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %36401 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %36402 = "torch.aten.expand"(%36396, %36400, %36401) : (!torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ>
    %36403 = "torch_c.to_builtin_tensor"(%36394) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %36404 = "torch_c.to_builtin_tensor"(%36402) : (!torch.vtensor<[4,1024,4096],f8E4M3FNUZ>) -> tensor<4x1024x4096xf8E4M3FNUZ>
    %36405 = "util.call"(%36403, %36404) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x?x1024xf32>
    %36406 = "torch_c.from_builtin_tensor"(%36405) : (tensor<4x?x1024xf32>) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%36406, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %36407 = "torch.aten.div.Tensor"(%36406, %18287) : (!torch.vtensor<[4,?,1024],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%36407, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %36408 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %36409 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %36410 = "torch.aten.clamp"(%36407, %36408, %36409) : (!torch.vtensor<[4,?,1024],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%36410, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %36411 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %36412 = "torch.prims.convert_element_type"(%36410, %36411) : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> !torch.vtensor<[4,?,1024],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%36412, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.int) -> ()
    %36413 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %36414 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %36415 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %36416 = "torch.prim.ListConstruct"(%36413, %18481, %36414, %36415) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %36417 = "torch.aten.view"(%36364, %36416) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%36417, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %36418 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %36419 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %36420 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %36421 = "torch.prim.ListConstruct"(%36418, %18481, %36419, %36420) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %36422 = "torch.aten.view"(%36388, %36421) : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%36422, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %36423 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %36424 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %36425 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %36426 = "torch.prim.ListConstruct"(%36423, %18481, %36424, %36425) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %36427 = "torch.aten.view"(%36412, %36426) : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%36427, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %36428 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
    %36429 = "torch.constant.none"() : () -> !torch.none
    %36430 = "torch.constant.none"() : () -> !torch.none
    %36431 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %36432 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %36433 = "torch.aten.arange"(%36428, %36429, %36430, %36431, %36432) : (!torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[131072],si64>
    %36434 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %36435 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %36436 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %36437 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %36438 = "torch.constant.none"() : () -> !torch.none
    %36439 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %36440 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %36441 = "torch.aten.arange.start_step"(%36434, %36435, %36436, %36437, %36438, %36439, %36440) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[64],si64>
    %36442 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %36443 = "torch.prims.convert_element_type"(%36441, %36442) : (!torch.vtensor<[64],si64>, !torch.int) -> !torch.vtensor<[64],f32>
    %36444 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %36445 = "torch.aten.div.Scalar"(%36443, %36444) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %36446 = "torch.constant.float"() <{value = 5.000000e+05 : f64}> : () -> !torch.float
    %36447 = "torch.aten.pow.Scalar"(%36446, %36445) : (!torch.float, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %36448 = "torch.aten.reciprocal"(%36447) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %36449 = "torch.constant.float"() <{value = 1.000000e+00 : f64}> : () -> !torch.float
    %36450 = "torch.aten.mul.Scalar"(%36448, %36449) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
    %36451 = "torch.aten.reciprocal"(%36450) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %36452 = "torch.constant.float"() <{value = 6.2831853071795862 : f64}> : () -> !torch.float
    %36453 = "torch.aten.mul.Scalar"(%36451, %36452) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
    %36454 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
    %36455 = "torch.aten.gt.Scalar"(%36453, %36454) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %36456 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %36457 = "torch.aten.div.Scalar"(%36450, %36456) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %36458 = "torch.aten.where.self"(%36455, %36457, %36450) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %36459 = "torch.aten.reciprocal"(%36453) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %36460 = "torch.constant.int"() <{value = 8192 : i64}> : () -> !torch.int
    %36461 = "torch.aten.mul.Scalar"(%36459, %36460) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %36462 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %36463 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %36464 = "torch.aten.sub.Scalar"(%36461, %36462, %36463) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
    %36465 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %36466 = "torch.aten.div.Scalar"(%36464, %36465) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %36467 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %36468 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %36469 = "torch.aten.rsub.Scalar"(%36466, %36467, %36468) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
    %36470 = "torch.aten.mul.Tensor"(%36469, %36458) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %36471 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %36472 = "torch.aten.div.Scalar"(%36470, %36471) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %36473 = "torch.aten.mul.Tensor"(%36466, %36458) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %36474 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %36475 = "torch.aten.add.Tensor"(%36472, %36473, %36474) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %36476 = "torch.constant.float"() <{value = 2.048000e+03 : f64}> : () -> !torch.float
    %36477 = "torch.aten.lt.Scalar"(%36453, %36476) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %36478 = "torch.aten.bitwise_not"(%36477) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %36479 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
    %36480 = "torch.aten.gt.Scalar"(%36453, %36479) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %36481 = "torch.aten.bitwise_not"(%36480) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %36482 = "torch.aten.mul.Tensor"(%36478, %36481) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %36483 = "torch.aten.where.self"(%36482, %36475, %36458) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %36484 = "torch.prim.ListConstruct"(%36483, %36483) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.list<vtensor>
    %36485 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %36486 = "torch.aten.cat"(%36484, %36485) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[128],f32>
    %36487 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %36488 = "torch.prims.convert_element_type"(%36433, %36487) : (!torch.vtensor<[131072],si64>, !torch.int) -> !torch.vtensor<[131072],f32>
    %36489 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %36490 = "torch.prims.convert_element_type"(%36486, %36489) : (!torch.vtensor<[128],f32>, !torch.int) -> !torch.vtensor<[128],f32>
    %36491 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
    %36492 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %36493 = "torch.prim.ListConstruct"(%36491, %36492) : (!torch.int, !torch.int) -> !torch.list<int>
    %36494 = "torch.aten.view"(%36488, %36493) : (!torch.vtensor<[131072],f32>, !torch.list<int>) -> !torch.vtensor<[131072,1],f32>
    %36495 = "torch.aten.mul.Tensor"(%36494, %36490) : (!torch.vtensor<[131072,1],f32>, !torch.vtensor<[128],f32>) -> !torch.vtensor<[131072,128],f32>
    %36496 = "torch.aten.cos"(%36495) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
    %36497 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %36498 = "torch.prims.convert_element_type"(%36496, %36497) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
    %36499 = "torch.aten.sin"(%36495) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
    %36500 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %36501 = "torch.prims.convert_element_type"(%36499, %36500) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
    %36502 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %36503 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %36504 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %36505 = "torch.aten.slice.Tensor"(%36498, %36502, %36503, %18481, %36504) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%36505, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %36506 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %36507 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %36508 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %36509 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %36510 = "torch.aten.slice.Tensor"(%36505, %36506, %36507, %36508, %36509) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%36510, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %36511 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %36512 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %36513 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %36514 = "torch.aten.slice.Tensor"(%36501, %36511, %36512, %18481, %36513) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%36514, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %36515 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %36516 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %36517 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %36518 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %36519 = "torch.aten.slice.Tensor"(%36514, %36515, %36516, %36517, %36518) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%36519, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %36520 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %36521 = "torch.aten.unsqueeze"(%36510, %36520) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%36521, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %36522 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %36523 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %36524 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %36525 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %36526 = "torch.aten.slice.Tensor"(%36521, %36522, %36523, %36524, %36525) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%36526, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %36527 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %36528 = "torch.aten.unsqueeze"(%36526, %36527) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%36528, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %36529 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %36530 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %36531 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %36532 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %36533 = "torch.aten.slice.Tensor"(%36528, %36529, %36530, %36531, %36532) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%36533, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %36534 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %36535 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %36536 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %36537 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %36538 = "torch.prim.ListConstruct"(%36534, %36535, %36536, %36537) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %36539 = "torch.aten.repeat"(%36533, %36538) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%36539, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
    %36540 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %36541 = "torch.aten.unsqueeze"(%36519, %36540) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%36541, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %36542 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %36543 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %36544 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %36545 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %36546 = "torch.aten.slice.Tensor"(%36541, %36542, %36543, %36544, %36545) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%36546, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %36547 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %36548 = "torch.aten.unsqueeze"(%36546, %36547) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%36548, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %36549 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %36550 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %36551 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %36552 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %36553 = "torch.aten.slice.Tensor"(%36548, %36549, %36550, %36551, %36552) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%36553, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %36554 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %36555 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %36556 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %36557 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %36558 = "torch.prim.ListConstruct"(%36554, %36555, %36556, %36557) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %36559 = "torch.aten.repeat"(%36553, %36558) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%36559, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
    %36560 = "torch.aten.mul.Tensor"(%36417, %36539) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%36560, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %36561 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %36562 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %36563 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %36564 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %36565 = "torch.aten.slice.Tensor"(%36417, %36561, %36562, %36563, %36564) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%36565, %18474) <{shape_expressions = #map18}> : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.int) -> ()
    %36566 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %36567 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %36568 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %36569 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %36570 = "torch.aten.slice.Tensor"(%36417, %36566, %36567, %36568, %36569) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%36570, %18474) <{shape_expressions = #map18}> : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.int) -> ()
    %36571 = "torch.aten.neg"(%36570) : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>) -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%36571, %18474) <{shape_expressions = #map18}> : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.int) -> ()
    %36572 = "torch.prim.ListConstruct"(%36571, %36565) : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>) -> !torch.list<vtensor>
    %36573 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %36574 = "torch.aten.cat"(%36572, %36573) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%36574, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %36575 = "torch.aten.mul.Tensor"(%36574, %36559) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%36575, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %36576 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %36577 = "torch.aten.add.Tensor"(%36560, %36575, %36576) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%36577, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %36578 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
    %36579 = "torch.constant.none"() : () -> !torch.none
    %36580 = "torch.constant.none"() : () -> !torch.none
    %36581 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %36582 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %36583 = "torch.aten.arange"(%36578, %36579, %36580, %36581, %36582) : (!torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[131072],si64>
    %36584 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %36585 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %36586 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %36587 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %36588 = "torch.constant.none"() : () -> !torch.none
    %36589 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %36590 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %36591 = "torch.aten.arange.start_step"(%36584, %36585, %36586, %36587, %36588, %36589, %36590) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[64],si64>
    %36592 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %36593 = "torch.prims.convert_element_type"(%36591, %36592) : (!torch.vtensor<[64],si64>, !torch.int) -> !torch.vtensor<[64],f32>
    %36594 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %36595 = "torch.aten.div.Scalar"(%36593, %36594) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %36596 = "torch.constant.float"() <{value = 5.000000e+05 : f64}> : () -> !torch.float
    %36597 = "torch.aten.pow.Scalar"(%36596, %36595) : (!torch.float, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %36598 = "torch.aten.reciprocal"(%36597) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %36599 = "torch.constant.float"() <{value = 1.000000e+00 : f64}> : () -> !torch.float
    %36600 = "torch.aten.mul.Scalar"(%36598, %36599) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
    %36601 = "torch.aten.reciprocal"(%36600) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %36602 = "torch.constant.float"() <{value = 6.2831853071795862 : f64}> : () -> !torch.float
    %36603 = "torch.aten.mul.Scalar"(%36601, %36602) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
    %36604 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
    %36605 = "torch.aten.gt.Scalar"(%36603, %36604) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %36606 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %36607 = "torch.aten.div.Scalar"(%36600, %36606) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %36608 = "torch.aten.where.self"(%36605, %36607, %36600) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %36609 = "torch.aten.reciprocal"(%36603) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %36610 = "torch.constant.int"() <{value = 8192 : i64}> : () -> !torch.int
    %36611 = "torch.aten.mul.Scalar"(%36609, %36610) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %36612 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %36613 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %36614 = "torch.aten.sub.Scalar"(%36611, %36612, %36613) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
    %36615 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %36616 = "torch.aten.div.Scalar"(%36614, %36615) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %36617 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %36618 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %36619 = "torch.aten.rsub.Scalar"(%36616, %36617, %36618) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
    %36620 = "torch.aten.mul.Tensor"(%36619, %36608) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %36621 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %36622 = "torch.aten.div.Scalar"(%36620, %36621) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %36623 = "torch.aten.mul.Tensor"(%36616, %36608) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %36624 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %36625 = "torch.aten.add.Tensor"(%36622, %36623, %36624) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %36626 = "torch.constant.float"() <{value = 2.048000e+03 : f64}> : () -> !torch.float
    %36627 = "torch.aten.lt.Scalar"(%36603, %36626) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %36628 = "torch.aten.bitwise_not"(%36627) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %36629 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
    %36630 = "torch.aten.gt.Scalar"(%36603, %36629) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %36631 = "torch.aten.bitwise_not"(%36630) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %36632 = "torch.aten.mul.Tensor"(%36628, %36631) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %36633 = "torch.aten.where.self"(%36632, %36625, %36608) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %36634 = "torch.prim.ListConstruct"(%36633, %36633) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.list<vtensor>
    %36635 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %36636 = "torch.aten.cat"(%36634, %36635) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[128],f32>
    %36637 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %36638 = "torch.prims.convert_element_type"(%36583, %36637) : (!torch.vtensor<[131072],si64>, !torch.int) -> !torch.vtensor<[131072],f32>
    %36639 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %36640 = "torch.prims.convert_element_type"(%36636, %36639) : (!torch.vtensor<[128],f32>, !torch.int) -> !torch.vtensor<[128],f32>
    %36641 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
    %36642 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %36643 = "torch.prim.ListConstruct"(%36641, %36642) : (!torch.int, !torch.int) -> !torch.list<int>
    %36644 = "torch.aten.view"(%36638, %36643) : (!torch.vtensor<[131072],f32>, !torch.list<int>) -> !torch.vtensor<[131072,1],f32>
    %36645 = "torch.aten.mul.Tensor"(%36644, %36640) : (!torch.vtensor<[131072,1],f32>, !torch.vtensor<[128],f32>) -> !torch.vtensor<[131072,128],f32>
    %36646 = "torch.aten.cos"(%36645) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
    %36647 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %36648 = "torch.prims.convert_element_type"(%36646, %36647) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
    %36649 = "torch.aten.sin"(%36645) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
    %36650 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %36651 = "torch.prims.convert_element_type"(%36649, %36650) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
    %36652 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %36653 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %36654 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %36655 = "torch.aten.slice.Tensor"(%36648, %36652, %36653, %18481, %36654) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%36655, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %36656 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %36657 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %36658 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %36659 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %36660 = "torch.aten.slice.Tensor"(%36655, %36656, %36657, %36658, %36659) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%36660, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %36661 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %36662 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %36663 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %36664 = "torch.aten.slice.Tensor"(%36651, %36661, %36662, %18481, %36663) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%36664, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %36665 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %36666 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %36667 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %36668 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %36669 = "torch.aten.slice.Tensor"(%36664, %36665, %36666, %36667, %36668) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%36669, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %36670 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %36671 = "torch.aten.unsqueeze"(%36660, %36670) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%36671, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %36672 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %36673 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %36674 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %36675 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %36676 = "torch.aten.slice.Tensor"(%36671, %36672, %36673, %36674, %36675) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%36676, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %36677 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %36678 = "torch.aten.unsqueeze"(%36676, %36677) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%36678, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %36679 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %36680 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %36681 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %36682 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %36683 = "torch.aten.slice.Tensor"(%36678, %36679, %36680, %36681, %36682) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%36683, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %36684 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %36685 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %36686 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %36687 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %36688 = "torch.prim.ListConstruct"(%36684, %36685, %36686, %36687) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %36689 = "torch.aten.repeat"(%36683, %36688) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%36689, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
    %36690 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %36691 = "torch.aten.unsqueeze"(%36669, %36690) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%36691, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %36692 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %36693 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %36694 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %36695 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %36696 = "torch.aten.slice.Tensor"(%36691, %36692, %36693, %36694, %36695) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%36696, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %36697 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %36698 = "torch.aten.unsqueeze"(%36696, %36697) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%36698, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %36699 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %36700 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %36701 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %36702 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %36703 = "torch.aten.slice.Tensor"(%36698, %36699, %36700, %36701, %36702) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%36703, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %36704 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %36705 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %36706 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %36707 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %36708 = "torch.prim.ListConstruct"(%36704, %36705, %36706, %36707) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %36709 = "torch.aten.repeat"(%36703, %36708) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%36709, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
    %36710 = "torch.aten.mul.Tensor"(%36422, %36689) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%36710, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %36711 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %36712 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %36713 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %36714 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %36715 = "torch.aten.slice.Tensor"(%36422, %36711, %36712, %36713, %36714) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%36715, %18474) <{shape_expressions = #map19}> : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.int) -> ()
    %36716 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %36717 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %36718 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %36719 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %36720 = "torch.aten.slice.Tensor"(%36422, %36716, %36717, %36718, %36719) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%36720, %18474) <{shape_expressions = #map19}> : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.int) -> ()
    %36721 = "torch.aten.neg"(%36720) : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>) -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%36721, %18474) <{shape_expressions = #map19}> : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.int) -> ()
    %36722 = "torch.prim.ListConstruct"(%36721, %36715) : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>) -> !torch.list<vtensor>
    %36723 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %36724 = "torch.aten.cat"(%36722, %36723) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%36724, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %36725 = "torch.aten.mul.Tensor"(%36724, %36709) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%36725, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %36726 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %36727 = "torch.aten.add.Tensor"(%36710, %36725, %36726) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%36727, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %36728 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %36729 = "torch.aten.mul.Scalar"(%arg69, %36728) : (!torch.vtensor<[4,?],si64>, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%36729, %18474) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %36730 = "torch.constant.int"() <{value = 54 : i64}> : () -> !torch.int
    %36731 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %36732 = "torch.aten.add.Scalar"(%36729, %36730, %36731) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%36732, %18474) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %36733 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %36734 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %36735 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %36736 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %36737 = "torch.prim.ListConstruct"(%36733, %18477, %36734, %36735, %36736) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %36738 = "torch.aten.view"(%36727, %36737) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%36738, %18474) <{shape_expressions = #map22}> : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %36739 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %36740 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %36741 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %36742 = "torch.prim.ListConstruct"(%19011, %36739, %36740, %36741) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %36743 = "torch.aten.view"(%36738, %36742) : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%36743, %18474) <{shape_expressions = #map23}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %36744 = "torch.prim.ListConstruct"(%19011) : (!torch.int) -> !torch.list<int>
    %36745 = "torch.aten.view"(%36732, %36744) : (!torch.vtensor<[4,?],si64>, !torch.list<int>) -> !torch.vtensor<[?],si64>
    "torch.bind_symbolic_shape"(%36745, %18474) <{shape_expressions = #map24}> : (!torch.vtensor<[?],si64>, !torch.int) -> ()
    %36746 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %36747 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %36748 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %36749 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %36750 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %36751 = "torch.prim.ListConstruct"(%18479, %36746, %36747, %36748, %36749, %36750) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %36752 = "torch.aten.view"(%36154, %36751) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%36752, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %36753 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %36754 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %36755 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %36756 = "torch.prim.ListConstruct"(%18993, %36753, %36754, %36755) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %36757 = "torch.aten.view"(%36752, %36756) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%36757, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %36758 = "torch.prim.ListConstruct"(%36745) : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>>
    %36759 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %36760 = "torch.aten.index_put"(%36757, %36758, %36743, %36759) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.bool) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%36760, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %36761 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %36762 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %36763 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %36764 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %36765 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %36766 = "torch.prim.ListConstruct"(%18479, %36761, %36762, %36763, %36764, %36765) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %36767 = "torch.aten.view"(%36760, %36766) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%36767, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %36768 = "torch.constant.int"() <{value = 2097152 : i64}> : () -> !torch.int
    %36769 = "torch.prim.ListConstruct"(%18479, %36768) : (!torch.int, !torch.int) -> !torch.list<int>
    %36770 = "torch.aten.view"(%36767, %36769) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,2097152],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%36770, %18475) <{shape_expressions = #map2}> : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> ()
    %36771 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %36772 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %36773 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %36774 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %36775 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %36776 = "torch.prim.ListConstruct"(%18479, %36771, %36772, %36773, %36774, %36775) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %36777 = "torch.aten.view"(%36770, %36776) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%36777, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %36778 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %36779 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %36780 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %36781 = "torch.prim.ListConstruct"(%18993, %36778, %36779, %36780) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %36782 = "torch.aten.view"(%36777, %36781) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%36782, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %36783 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %36784 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %36785 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %36786 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %36787 = "torch.prim.ListConstruct"(%36783, %18477, %36784, %36785, %36786) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %36788 = "torch.aten.view"(%36427, %36787) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%36788, %18474) <{shape_expressions = #map22}> : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %36789 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %36790 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %36791 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %36792 = "torch.prim.ListConstruct"(%19011, %36789, %36790, %36791) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %36793 = "torch.aten.view"(%36788, %36792) : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%36793, %18474) <{shape_expressions = #map23}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %36794 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %36795 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %36796 = "torch.aten.add.Scalar"(%36732, %36794, %36795) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%36796, %18474) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %36797 = "torch.prim.ListConstruct"(%19011) : (!torch.int) -> !torch.list<int>
    %36798 = "torch.aten.view"(%36796, %36797) : (!torch.vtensor<[4,?],si64>, !torch.list<int>) -> !torch.vtensor<[?],si64>
    "torch.bind_symbolic_shape"(%36798, %18474) <{shape_expressions = #map24}> : (!torch.vtensor<[?],si64>, !torch.int) -> ()
    %36799 = "torch.prim.ListConstruct"(%36798) : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>>
    %36800 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %36801 = "torch.aten.index_put"(%36782, %36799, %36793, %36800) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.bool) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%36801, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %36802 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %36803 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %36804 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %36805 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %36806 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %36807 = "torch.prim.ListConstruct"(%18479, %36802, %36803, %36804, %36805, %36806) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %36808 = "torch.aten.view"(%36801, %36807) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%36808, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %36809 = "torch.constant.int"() <{value = 2097152 : i64}> : () -> !torch.int
    %36810 = "torch.prim.ListConstruct"(%18479, %36809) : (!torch.int, !torch.int) -> !torch.list<int>
    %36811 = "torch.aten.view"(%36808, %36810) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,2097152],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%36811, %18475) <{shape_expressions = #map2}> : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> ()
    %36812 = "torch.constant.int"() <{value = -2 : i64}> : () -> !torch.int
    %36813 = "torch.aten.unsqueeze"(%36727, %36812) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%36813, %18474) <{shape_expressions = #map25}> : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.int) -> ()
    %36814 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %36815 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %36816 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %36817 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %36818 = "torch.prim.ListConstruct"(%36814, %18481, %36815, %36816, %36817) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %36819 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %36820 = "torch.aten.expand"(%36813, %36818, %36819) : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%36820, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %36821 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %36822 = "torch.aten.clone"(%36820, %36821) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%36822, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %36823 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %36824 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %36825 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %36826 = "torch.prim.ListConstruct"(%36823, %18481, %36824, %36825) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %36827 = "torch.aten._unsafe_view"(%36822, %36826) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%36827, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %36828 = "torch.constant.int"() <{value = -2 : i64}> : () -> !torch.int
    %36829 = "torch.aten.unsqueeze"(%36427, %36828) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%36829, %18474) <{shape_expressions = #map25}> : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.int) -> ()
    %36830 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %36831 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %36832 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %36833 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %36834 = "torch.prim.ListConstruct"(%36830, %18481, %36831, %36832, %36833) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %36835 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %36836 = "torch.aten.expand"(%36829, %36834, %36835) : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%36836, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %36837 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %36838 = "torch.aten.clone"(%36836, %36837) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%36838, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %36839 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %36840 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %36841 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %36842 = "torch.prim.ListConstruct"(%36839, %18481, %36840, %36841) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %36843 = "torch.aten._unsafe_view"(%36838, %36842) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%36843, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %36844 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %36845 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %36846 = "torch.aten.transpose.int"(%36577, %36844, %36845) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%36846, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %36847 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %36848 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %36849 = "torch.aten.transpose.int"(%36827, %36847, %36848) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%36849, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %36850 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %36851 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %36852 = "torch.aten.transpose.int"(%36843, %36850, %36851) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%36852, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %36853 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %36854 = "torch.aten.squeeze.dim"(%18570, %36853) : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,1,?,?],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%36854, %18474) <{shape_expressions = #map8}> : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> ()
    %36855 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %36856 = "torch.aten.squeeze.dim"(%36854, %36855) : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,1,?,?],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%36856, %18474) <{shape_expressions = #map8}> : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> ()
    %36857 = "torch_c.to_builtin_tensor"(%36846) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %36858 = "torch_c.to_builtin_tensor"(%36849) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %36859 = "torch_c.to_builtin_tensor"(%36852) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %36860 = "torch_c.to_builtin_tensor"(%36856) : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>) -> tensor<4x1x?x?xf8E4M3FNUZ>
    %36861 = "tensor.cast"(%36860) : (tensor<4x1x?x?xf8E4M3FNUZ>) -> tensor<?x?x?x?xf8E4M3FNUZ>
    %36862 = "torch_c.to_builtin_tensor"(%18289) : (!torch.vtensor<[],f32>) -> tensor<f32>
    %36863 = "util.call"(%36857, %36858, %36859, %36862, %36861) <{callee = @sharktank_masked_flash_attention_4_32_128_128_f8E4M3FNUZ_f32_f32}> : (tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<f32>, tensor<?x?x?x?xf8E4M3FNUZ>) -> tensor<4x32x?x128xf32>
    %36864 = "torch_c.from_builtin_tensor"(%36863) : (tensor<4x32x?x128xf32>) -> !torch.vtensor<[4,32,?,128],f32>
    "torch.bind_symbolic_shape"(%36864, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f32>, !torch.int) -> ()
    %36865 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %36866 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %36867 = "torch.aten.transpose.int"(%36864, %36865, %36866) : (!torch.vtensor<[4,32,?,128],f32>, !torch.int, !torch.int) -> !torch.vtensor<[4,?,32,128],f32>
    "torch.bind_symbolic_shape"(%36867, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f32>, !torch.int) -> ()
    %36868 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %36869 = "torch.aten.clone"(%36867, %36868) : (!torch.vtensor<[4,?,32,128],f32>, !torch.int) -> !torch.vtensor<[4,?,32,128],f32>
    "torch.bind_symbolic_shape"(%36869, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f32>, !torch.int) -> ()
    %36870 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %36871 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %36872 = "torch.prim.ListConstruct"(%36870, %18481, %36871) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %36873 = "torch.aten._unsafe_view"(%36869, %36872) : (!torch.vtensor<[4,?,32,128],f32>, !torch.list<int>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%36873, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %36874 = "torch.aten.div.Tensor"(%36873, %18291) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%36874, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %36875 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %36876 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %36877 = "torch.aten.clamp"(%36874, %36875, %36876) : (!torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%36877, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %36878 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %36879 = "torch.prims.convert_element_type"(%36877, %36878) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%36879, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %36880 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %36881 = "torch.aten.unsqueeze"(%18293, %36880) : (!torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>
    %36882 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %36883 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %36884 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %36885 = "torch.prim.ListConstruct"(%36882, %36883, %36884) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %36886 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %36887 = "torch.aten.expand"(%36881, %36885, %36886) : (!torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ>
    %36888 = "torch_c.to_builtin_tensor"(%36879) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %36889 = "torch_c.to_builtin_tensor"(%36887) : (!torch.vtensor<[4,4096,4096],f8E4M3FNUZ>) -> tensor<4x4096x4096xf8E4M3FNUZ>
    %36890 = "util.call"(%36888, %36889) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x?x4096xf32>
    %36891 = "torch_c.from_builtin_tensor"(%36890) : (tensor<4x?x4096xf32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%36891, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %36892 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %36893 = "torch.prims.convert_element_type"(%36891, %36892) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%36893, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %36894 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %36895 = "torch.aten.add.Tensor"(%36321, %36893, %36894) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%36895, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %36896 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %36897 = "torch.prims.convert_element_type"(%36895, %36896) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%36897, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %36898 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %36899 = "torch.aten.pow.Tensor_Scalar"(%36897, %36898) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%36899, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %36900 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %36901 = "torch.prim.ListConstruct"(%36900) : (!torch.int) -> !torch.list<int>
    %36902 = "torch.constant.bool"() <{value = true}> : () -> !torch.bool
    %36903 = "torch.constant.none"() : () -> !torch.none
    %36904 = "torch.aten.mean.dim"(%36899, %36901, %36902, %36903) : (!torch.vtensor<[4,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%36904, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %36905 = "torch.constant.float"() <{value = 1.000000e-05 : f64}> : () -> !torch.float
    %36906 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %36907 = "torch.aten.add.Scalar"(%36904, %36905, %36906) : (!torch.vtensor<[4,?,1],f32>, !torch.float, !torch.int) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%36907, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %36908 = "torch.aten.rsqrt"(%36907) : (!torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%36908, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %36909 = "torch.aten.mul.Tensor"(%36897, %36908) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%36909, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %36910 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %36911 = "torch.prims.convert_element_type"(%36909, %36910) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%36911, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %36912 = "torch.aten.mul.Tensor"(%18295, %36911) : (!torch.vtensor<[4096],bf16>, !torch.vtensor<[4,?,4096],bf16>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%36912, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %36913 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %36914 = "torch.prims.convert_element_type"(%36912, %36913) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%36914, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %36915 = "torch.aten.div.Tensor"(%36914, %18297) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%36915, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %36916 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %36917 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %36918 = "torch.aten.clamp"(%36915, %36916, %36917) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%36918, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %36919 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %36920 = "torch.prims.convert_element_type"(%36918, %36919) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%36920, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %36921 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %36922 = "torch.aten.unsqueeze"(%18299, %36921) : (!torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>
    %36923 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %36924 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %36925 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %36926 = "torch.prim.ListConstruct"(%36923, %36924, %36925) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %36927 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %36928 = "torch.aten.expand"(%36922, %36926, %36927) : (!torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ>
    %36929 = "torch_c.to_builtin_tensor"(%36920) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %36930 = "torch_c.to_builtin_tensor"(%36928) : (!torch.vtensor<[4,14336,4096],f8E4M3FNUZ>) -> tensor<4x14336x4096xf8E4M3FNUZ>
    %36931 = "util.call"(%36929, %36930) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x?x14336xf32>
    %36932 = "torch_c.from_builtin_tensor"(%36931) : (tensor<4x?x14336xf32>) -> !torch.vtensor<[4,?,14336],f32>
    "torch.bind_symbolic_shape"(%36932, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> ()
    %36933 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %36934 = "torch.prims.convert_element_type"(%36932, %36933) : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%36934, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %36935 = "torch.aten.silu"(%36934) : (!torch.vtensor<[4,?,14336],bf16>) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%36935, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %36936 = "torch.aten.div.Tensor"(%36914, %18301) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%36936, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %36937 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %36938 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %36939 = "torch.aten.clamp"(%36936, %36937, %36938) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%36939, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %36940 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %36941 = "torch.prims.convert_element_type"(%36939, %36940) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%36941, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %36942 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %36943 = "torch.aten.unsqueeze"(%18303, %36942) : (!torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>
    %36944 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %36945 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %36946 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %36947 = "torch.prim.ListConstruct"(%36944, %36945, %36946) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %36948 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %36949 = "torch.aten.expand"(%36943, %36947, %36948) : (!torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ>
    %36950 = "torch_c.to_builtin_tensor"(%36941) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %36951 = "torch_c.to_builtin_tensor"(%36949) : (!torch.vtensor<[4,14336,4096],f8E4M3FNUZ>) -> tensor<4x14336x4096xf8E4M3FNUZ>
    %36952 = "util.call"(%36950, %36951) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x?x14336xf32>
    %36953 = "torch_c.from_builtin_tensor"(%36952) : (tensor<4x?x14336xf32>) -> !torch.vtensor<[4,?,14336],f32>
    "torch.bind_symbolic_shape"(%36953, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> ()
    %36954 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %36955 = "torch.prims.convert_element_type"(%36953, %36954) : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%36955, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %36956 = "torch.aten.mul.Tensor"(%36935, %36955) : (!torch.vtensor<[4,?,14336],bf16>, !torch.vtensor<[4,?,14336],bf16>) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%36956, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %36957 = "torch.aten.div.Tensor"(%36956, %18305) : (!torch.vtensor<[4,?,14336],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%36957, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %36958 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %36959 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %36960 = "torch.aten.clamp"(%36957, %36958, %36959) : (!torch.vtensor<[4,?,14336],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%36960, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %36961 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %36962 = "torch.prims.convert_element_type"(%36960, %36961) : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> !torch.vtensor<[4,?,14336],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%36962, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],f8E4M3FNUZ>, !torch.int) -> ()
    %36963 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %36964 = "torch.aten.unsqueeze"(%18307, %36963) : (!torch.vtensor<[4096,14336],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,14336],f8E4M3FNUZ>
    %36965 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %36966 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %36967 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %36968 = "torch.prim.ListConstruct"(%36965, %36966, %36967) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %36969 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %36970 = "torch.aten.expand"(%36964, %36968, %36969) : (!torch.vtensor<[1,4096,14336],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,14336],f8E4M3FNUZ>
    %36971 = "torch_c.to_builtin_tensor"(%36962) : (!torch.vtensor<[4,?,14336],f8E4M3FNUZ>) -> tensor<4x?x14336xf8E4M3FNUZ>
    %36972 = "torch_c.to_builtin_tensor"(%36970) : (!torch.vtensor<[4,4096,14336],f8E4M3FNUZ>) -> tensor<4x4096x14336xf8E4M3FNUZ>
    %36973 = "util.call"(%36971, %36972) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx14336xf8E4M3FNUZ_R4x4096x14336xf8E4M3FNUZ}> : (tensor<4x?x14336xf8E4M3FNUZ>, tensor<4x4096x14336xf8E4M3FNUZ>) -> tensor<4x?x4096xf32>
    %36974 = "torch_c.from_builtin_tensor"(%36973) : (tensor<4x?x4096xf32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%36974, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %36975 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %36976 = "torch.prims.convert_element_type"(%36974, %36975) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%36976, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %36977 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %36978 = "torch.aten.add.Tensor"(%36895, %36976, %36977) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%36978, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %36979 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %36980 = "torch.prims.convert_element_type"(%36978, %36979) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%36980, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %36981 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %36982 = "torch.aten.pow.Tensor_Scalar"(%36980, %36981) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%36982, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %36983 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %36984 = "torch.prim.ListConstruct"(%36983) : (!torch.int) -> !torch.list<int>
    %36985 = "torch.constant.bool"() <{value = true}> : () -> !torch.bool
    %36986 = "torch.constant.none"() : () -> !torch.none
    %36987 = "torch.aten.mean.dim"(%36982, %36984, %36985, %36986) : (!torch.vtensor<[4,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%36987, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %36988 = "torch.constant.float"() <{value = 1.000000e-05 : f64}> : () -> !torch.float
    %36989 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %36990 = "torch.aten.add.Scalar"(%36987, %36988, %36989) : (!torch.vtensor<[4,?,1],f32>, !torch.float, !torch.int) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%36990, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %36991 = "torch.aten.rsqrt"(%36990) : (!torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%36991, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %36992 = "torch.aten.mul.Tensor"(%36980, %36991) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%36992, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %36993 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %36994 = "torch.prims.convert_element_type"(%36992, %36993) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%36994, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %36995 = "torch.aten.mul.Tensor"(%18309, %36994) : (!torch.vtensor<[4096],bf16>, !torch.vtensor<[4,?,4096],bf16>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%36995, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %36996 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %36997 = "torch.prims.convert_element_type"(%36995, %36996) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%36997, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %36998 = "torch.aten.div.Tensor"(%36997, %18311) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%36998, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %36999 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %37000 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %37001 = "torch.aten.clamp"(%36998, %36999, %37000) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%37001, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %37002 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %37003 = "torch.prims.convert_element_type"(%37001, %37002) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%37003, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %37004 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %37005 = "torch.aten.unsqueeze"(%18313, %37004) : (!torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>
    %37006 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %37007 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %37008 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %37009 = "torch.prim.ListConstruct"(%37006, %37007, %37008) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %37010 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %37011 = "torch.aten.expand"(%37005, %37009, %37010) : (!torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ>
    %37012 = "torch_c.to_builtin_tensor"(%37003) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %37013 = "torch_c.to_builtin_tensor"(%37011) : (!torch.vtensor<[4,4096,4096],f8E4M3FNUZ>) -> tensor<4x4096x4096xf8E4M3FNUZ>
    %37014 = "util.call"(%37012, %37013) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x?x4096xf32>
    %37015 = "torch_c.from_builtin_tensor"(%37014) : (tensor<4x?x4096xf32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%37015, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %37016 = "torch.aten.div.Tensor"(%37015, %18315) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%37016, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %37017 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %37018 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %37019 = "torch.aten.clamp"(%37016, %37017, %37018) : (!torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%37019, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %37020 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %37021 = "torch.prims.convert_element_type"(%37019, %37020) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%37021, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %37022 = "torch.aten.div.Tensor"(%36997, %18317) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%37022, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %37023 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %37024 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %37025 = "torch.aten.clamp"(%37022, %37023, %37024) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%37025, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %37026 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %37027 = "torch.prims.convert_element_type"(%37025, %37026) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%37027, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %37028 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %37029 = "torch.aten.unsqueeze"(%18319, %37028) : (!torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>
    %37030 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %37031 = "torch.constant.int"() <{value = 1024 : i64}> : () -> !torch.int
    %37032 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %37033 = "torch.prim.ListConstruct"(%37030, %37031, %37032) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %37034 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %37035 = "torch.aten.expand"(%37029, %37033, %37034) : (!torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ>
    %37036 = "torch_c.to_builtin_tensor"(%37027) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %37037 = "torch_c.to_builtin_tensor"(%37035) : (!torch.vtensor<[4,1024,4096],f8E4M3FNUZ>) -> tensor<4x1024x4096xf8E4M3FNUZ>
    %37038 = "util.call"(%37036, %37037) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x?x1024xf32>
    %37039 = "torch_c.from_builtin_tensor"(%37038) : (tensor<4x?x1024xf32>) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%37039, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %37040 = "torch.aten.div.Tensor"(%37039, %18321) : (!torch.vtensor<[4,?,1024],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%37040, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %37041 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %37042 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %37043 = "torch.aten.clamp"(%37040, %37041, %37042) : (!torch.vtensor<[4,?,1024],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%37043, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %37044 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %37045 = "torch.prims.convert_element_type"(%37043, %37044) : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> !torch.vtensor<[4,?,1024],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%37045, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.int) -> ()
    %37046 = "torch.aten.div.Tensor"(%36997, %18323) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%37046, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %37047 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %37048 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %37049 = "torch.aten.clamp"(%37046, %37047, %37048) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%37049, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %37050 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %37051 = "torch.prims.convert_element_type"(%37049, %37050) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%37051, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %37052 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %37053 = "torch.aten.unsqueeze"(%18325, %37052) : (!torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>
    %37054 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %37055 = "torch.constant.int"() <{value = 1024 : i64}> : () -> !torch.int
    %37056 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %37057 = "torch.prim.ListConstruct"(%37054, %37055, %37056) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %37058 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %37059 = "torch.aten.expand"(%37053, %37057, %37058) : (!torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ>
    %37060 = "torch_c.to_builtin_tensor"(%37051) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %37061 = "torch_c.to_builtin_tensor"(%37059) : (!torch.vtensor<[4,1024,4096],f8E4M3FNUZ>) -> tensor<4x1024x4096xf8E4M3FNUZ>
    %37062 = "util.call"(%37060, %37061) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x?x1024xf32>
    %37063 = "torch_c.from_builtin_tensor"(%37062) : (tensor<4x?x1024xf32>) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%37063, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %37064 = "torch.aten.div.Tensor"(%37063, %18327) : (!torch.vtensor<[4,?,1024],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%37064, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %37065 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %37066 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %37067 = "torch.aten.clamp"(%37064, %37065, %37066) : (!torch.vtensor<[4,?,1024],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%37067, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %37068 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %37069 = "torch.prims.convert_element_type"(%37067, %37068) : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> !torch.vtensor<[4,?,1024],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%37069, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.int) -> ()
    %37070 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %37071 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %37072 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %37073 = "torch.prim.ListConstruct"(%37070, %18481, %37071, %37072) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %37074 = "torch.aten.view"(%37021, %37073) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%37074, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %37075 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %37076 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %37077 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %37078 = "torch.prim.ListConstruct"(%37075, %18481, %37076, %37077) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %37079 = "torch.aten.view"(%37045, %37078) : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%37079, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %37080 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %37081 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %37082 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %37083 = "torch.prim.ListConstruct"(%37080, %18481, %37081, %37082) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %37084 = "torch.aten.view"(%37069, %37083) : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%37084, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %37085 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
    %37086 = "torch.constant.none"() : () -> !torch.none
    %37087 = "torch.constant.none"() : () -> !torch.none
    %37088 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %37089 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %37090 = "torch.aten.arange"(%37085, %37086, %37087, %37088, %37089) : (!torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[131072],si64>
    %37091 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %37092 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %37093 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %37094 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %37095 = "torch.constant.none"() : () -> !torch.none
    %37096 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %37097 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %37098 = "torch.aten.arange.start_step"(%37091, %37092, %37093, %37094, %37095, %37096, %37097) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[64],si64>
    %37099 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %37100 = "torch.prims.convert_element_type"(%37098, %37099) : (!torch.vtensor<[64],si64>, !torch.int) -> !torch.vtensor<[64],f32>
    %37101 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %37102 = "torch.aten.div.Scalar"(%37100, %37101) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %37103 = "torch.constant.float"() <{value = 5.000000e+05 : f64}> : () -> !torch.float
    %37104 = "torch.aten.pow.Scalar"(%37103, %37102) : (!torch.float, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %37105 = "torch.aten.reciprocal"(%37104) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %37106 = "torch.constant.float"() <{value = 1.000000e+00 : f64}> : () -> !torch.float
    %37107 = "torch.aten.mul.Scalar"(%37105, %37106) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
    %37108 = "torch.aten.reciprocal"(%37107) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %37109 = "torch.constant.float"() <{value = 6.2831853071795862 : f64}> : () -> !torch.float
    %37110 = "torch.aten.mul.Scalar"(%37108, %37109) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
    %37111 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
    %37112 = "torch.aten.gt.Scalar"(%37110, %37111) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %37113 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %37114 = "torch.aten.div.Scalar"(%37107, %37113) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %37115 = "torch.aten.where.self"(%37112, %37114, %37107) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %37116 = "torch.aten.reciprocal"(%37110) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %37117 = "torch.constant.int"() <{value = 8192 : i64}> : () -> !torch.int
    %37118 = "torch.aten.mul.Scalar"(%37116, %37117) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %37119 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %37120 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %37121 = "torch.aten.sub.Scalar"(%37118, %37119, %37120) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
    %37122 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %37123 = "torch.aten.div.Scalar"(%37121, %37122) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %37124 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %37125 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %37126 = "torch.aten.rsub.Scalar"(%37123, %37124, %37125) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
    %37127 = "torch.aten.mul.Tensor"(%37126, %37115) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %37128 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %37129 = "torch.aten.div.Scalar"(%37127, %37128) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %37130 = "torch.aten.mul.Tensor"(%37123, %37115) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %37131 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %37132 = "torch.aten.add.Tensor"(%37129, %37130, %37131) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %37133 = "torch.constant.float"() <{value = 2.048000e+03 : f64}> : () -> !torch.float
    %37134 = "torch.aten.lt.Scalar"(%37110, %37133) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %37135 = "torch.aten.bitwise_not"(%37134) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %37136 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
    %37137 = "torch.aten.gt.Scalar"(%37110, %37136) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %37138 = "torch.aten.bitwise_not"(%37137) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %37139 = "torch.aten.mul.Tensor"(%37135, %37138) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %37140 = "torch.aten.where.self"(%37139, %37132, %37115) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %37141 = "torch.prim.ListConstruct"(%37140, %37140) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.list<vtensor>
    %37142 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %37143 = "torch.aten.cat"(%37141, %37142) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[128],f32>
    %37144 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %37145 = "torch.prims.convert_element_type"(%37090, %37144) : (!torch.vtensor<[131072],si64>, !torch.int) -> !torch.vtensor<[131072],f32>
    %37146 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %37147 = "torch.prims.convert_element_type"(%37143, %37146) : (!torch.vtensor<[128],f32>, !torch.int) -> !torch.vtensor<[128],f32>
    %37148 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
    %37149 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %37150 = "torch.prim.ListConstruct"(%37148, %37149) : (!torch.int, !torch.int) -> !torch.list<int>
    %37151 = "torch.aten.view"(%37145, %37150) : (!torch.vtensor<[131072],f32>, !torch.list<int>) -> !torch.vtensor<[131072,1],f32>
    %37152 = "torch.aten.mul.Tensor"(%37151, %37147) : (!torch.vtensor<[131072,1],f32>, !torch.vtensor<[128],f32>) -> !torch.vtensor<[131072,128],f32>
    %37153 = "torch.aten.cos"(%37152) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
    %37154 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %37155 = "torch.prims.convert_element_type"(%37153, %37154) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
    %37156 = "torch.aten.sin"(%37152) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
    %37157 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %37158 = "torch.prims.convert_element_type"(%37156, %37157) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
    %37159 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %37160 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %37161 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %37162 = "torch.aten.slice.Tensor"(%37155, %37159, %37160, %18481, %37161) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%37162, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %37163 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %37164 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %37165 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %37166 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %37167 = "torch.aten.slice.Tensor"(%37162, %37163, %37164, %37165, %37166) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%37167, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %37168 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %37169 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %37170 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %37171 = "torch.aten.slice.Tensor"(%37158, %37168, %37169, %18481, %37170) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%37171, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %37172 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %37173 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %37174 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %37175 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %37176 = "torch.aten.slice.Tensor"(%37171, %37172, %37173, %37174, %37175) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%37176, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %37177 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %37178 = "torch.aten.unsqueeze"(%37167, %37177) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%37178, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %37179 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %37180 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %37181 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %37182 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %37183 = "torch.aten.slice.Tensor"(%37178, %37179, %37180, %37181, %37182) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%37183, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %37184 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %37185 = "torch.aten.unsqueeze"(%37183, %37184) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%37185, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %37186 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %37187 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %37188 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %37189 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %37190 = "torch.aten.slice.Tensor"(%37185, %37186, %37187, %37188, %37189) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%37190, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %37191 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %37192 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %37193 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %37194 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %37195 = "torch.prim.ListConstruct"(%37191, %37192, %37193, %37194) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %37196 = "torch.aten.repeat"(%37190, %37195) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%37196, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
    %37197 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %37198 = "torch.aten.unsqueeze"(%37176, %37197) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%37198, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %37199 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %37200 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %37201 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %37202 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %37203 = "torch.aten.slice.Tensor"(%37198, %37199, %37200, %37201, %37202) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%37203, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %37204 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %37205 = "torch.aten.unsqueeze"(%37203, %37204) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%37205, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %37206 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %37207 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %37208 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %37209 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %37210 = "torch.aten.slice.Tensor"(%37205, %37206, %37207, %37208, %37209) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%37210, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %37211 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %37212 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %37213 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %37214 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %37215 = "torch.prim.ListConstruct"(%37211, %37212, %37213, %37214) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %37216 = "torch.aten.repeat"(%37210, %37215) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%37216, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
    %37217 = "torch.aten.mul.Tensor"(%37074, %37196) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%37217, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %37218 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %37219 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %37220 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %37221 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %37222 = "torch.aten.slice.Tensor"(%37074, %37218, %37219, %37220, %37221) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%37222, %18474) <{shape_expressions = #map18}> : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.int) -> ()
    %37223 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %37224 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %37225 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %37226 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %37227 = "torch.aten.slice.Tensor"(%37074, %37223, %37224, %37225, %37226) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%37227, %18474) <{shape_expressions = #map18}> : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.int) -> ()
    %37228 = "torch.aten.neg"(%37227) : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>) -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%37228, %18474) <{shape_expressions = #map18}> : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.int) -> ()
    %37229 = "torch.prim.ListConstruct"(%37228, %37222) : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>) -> !torch.list<vtensor>
    %37230 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %37231 = "torch.aten.cat"(%37229, %37230) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%37231, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %37232 = "torch.aten.mul.Tensor"(%37231, %37216) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%37232, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %37233 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %37234 = "torch.aten.add.Tensor"(%37217, %37232, %37233) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%37234, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %37235 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
    %37236 = "torch.constant.none"() : () -> !torch.none
    %37237 = "torch.constant.none"() : () -> !torch.none
    %37238 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %37239 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %37240 = "torch.aten.arange"(%37235, %37236, %37237, %37238, %37239) : (!torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[131072],si64>
    %37241 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %37242 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %37243 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %37244 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %37245 = "torch.constant.none"() : () -> !torch.none
    %37246 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %37247 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %37248 = "torch.aten.arange.start_step"(%37241, %37242, %37243, %37244, %37245, %37246, %37247) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[64],si64>
    %37249 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %37250 = "torch.prims.convert_element_type"(%37248, %37249) : (!torch.vtensor<[64],si64>, !torch.int) -> !torch.vtensor<[64],f32>
    %37251 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %37252 = "torch.aten.div.Scalar"(%37250, %37251) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %37253 = "torch.constant.float"() <{value = 5.000000e+05 : f64}> : () -> !torch.float
    %37254 = "torch.aten.pow.Scalar"(%37253, %37252) : (!torch.float, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %37255 = "torch.aten.reciprocal"(%37254) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %37256 = "torch.constant.float"() <{value = 1.000000e+00 : f64}> : () -> !torch.float
    %37257 = "torch.aten.mul.Scalar"(%37255, %37256) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
    %37258 = "torch.aten.reciprocal"(%37257) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %37259 = "torch.constant.float"() <{value = 6.2831853071795862 : f64}> : () -> !torch.float
    %37260 = "torch.aten.mul.Scalar"(%37258, %37259) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
    %37261 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
    %37262 = "torch.aten.gt.Scalar"(%37260, %37261) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %37263 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %37264 = "torch.aten.div.Scalar"(%37257, %37263) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %37265 = "torch.aten.where.self"(%37262, %37264, %37257) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %37266 = "torch.aten.reciprocal"(%37260) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %37267 = "torch.constant.int"() <{value = 8192 : i64}> : () -> !torch.int
    %37268 = "torch.aten.mul.Scalar"(%37266, %37267) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %37269 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %37270 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %37271 = "torch.aten.sub.Scalar"(%37268, %37269, %37270) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
    %37272 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %37273 = "torch.aten.div.Scalar"(%37271, %37272) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %37274 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %37275 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %37276 = "torch.aten.rsub.Scalar"(%37273, %37274, %37275) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
    %37277 = "torch.aten.mul.Tensor"(%37276, %37265) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %37278 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %37279 = "torch.aten.div.Scalar"(%37277, %37278) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %37280 = "torch.aten.mul.Tensor"(%37273, %37265) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %37281 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %37282 = "torch.aten.add.Tensor"(%37279, %37280, %37281) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %37283 = "torch.constant.float"() <{value = 2.048000e+03 : f64}> : () -> !torch.float
    %37284 = "torch.aten.lt.Scalar"(%37260, %37283) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %37285 = "torch.aten.bitwise_not"(%37284) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %37286 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
    %37287 = "torch.aten.gt.Scalar"(%37260, %37286) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %37288 = "torch.aten.bitwise_not"(%37287) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %37289 = "torch.aten.mul.Tensor"(%37285, %37288) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %37290 = "torch.aten.where.self"(%37289, %37282, %37265) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %37291 = "torch.prim.ListConstruct"(%37290, %37290) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.list<vtensor>
    %37292 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %37293 = "torch.aten.cat"(%37291, %37292) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[128],f32>
    %37294 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %37295 = "torch.prims.convert_element_type"(%37240, %37294) : (!torch.vtensor<[131072],si64>, !torch.int) -> !torch.vtensor<[131072],f32>
    %37296 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %37297 = "torch.prims.convert_element_type"(%37293, %37296) : (!torch.vtensor<[128],f32>, !torch.int) -> !torch.vtensor<[128],f32>
    %37298 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
    %37299 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %37300 = "torch.prim.ListConstruct"(%37298, %37299) : (!torch.int, !torch.int) -> !torch.list<int>
    %37301 = "torch.aten.view"(%37295, %37300) : (!torch.vtensor<[131072],f32>, !torch.list<int>) -> !torch.vtensor<[131072,1],f32>
    %37302 = "torch.aten.mul.Tensor"(%37301, %37297) : (!torch.vtensor<[131072,1],f32>, !torch.vtensor<[128],f32>) -> !torch.vtensor<[131072,128],f32>
    %37303 = "torch.aten.cos"(%37302) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
    %37304 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %37305 = "torch.prims.convert_element_type"(%37303, %37304) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
    %37306 = "torch.aten.sin"(%37302) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
    %37307 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %37308 = "torch.prims.convert_element_type"(%37306, %37307) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
    %37309 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %37310 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %37311 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %37312 = "torch.aten.slice.Tensor"(%37305, %37309, %37310, %18481, %37311) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%37312, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %37313 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %37314 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %37315 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %37316 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %37317 = "torch.aten.slice.Tensor"(%37312, %37313, %37314, %37315, %37316) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%37317, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %37318 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %37319 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %37320 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %37321 = "torch.aten.slice.Tensor"(%37308, %37318, %37319, %18481, %37320) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%37321, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %37322 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %37323 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %37324 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %37325 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %37326 = "torch.aten.slice.Tensor"(%37321, %37322, %37323, %37324, %37325) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%37326, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %37327 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %37328 = "torch.aten.unsqueeze"(%37317, %37327) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%37328, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %37329 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %37330 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %37331 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %37332 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %37333 = "torch.aten.slice.Tensor"(%37328, %37329, %37330, %37331, %37332) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%37333, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %37334 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %37335 = "torch.aten.unsqueeze"(%37333, %37334) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%37335, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %37336 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %37337 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %37338 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %37339 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %37340 = "torch.aten.slice.Tensor"(%37335, %37336, %37337, %37338, %37339) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%37340, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %37341 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %37342 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %37343 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %37344 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %37345 = "torch.prim.ListConstruct"(%37341, %37342, %37343, %37344) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %37346 = "torch.aten.repeat"(%37340, %37345) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%37346, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
    %37347 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %37348 = "torch.aten.unsqueeze"(%37326, %37347) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%37348, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %37349 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %37350 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %37351 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %37352 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %37353 = "torch.aten.slice.Tensor"(%37348, %37349, %37350, %37351, %37352) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%37353, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %37354 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %37355 = "torch.aten.unsqueeze"(%37353, %37354) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%37355, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %37356 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %37357 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %37358 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %37359 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %37360 = "torch.aten.slice.Tensor"(%37355, %37356, %37357, %37358, %37359) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%37360, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %37361 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %37362 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %37363 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %37364 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %37365 = "torch.prim.ListConstruct"(%37361, %37362, %37363, %37364) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %37366 = "torch.aten.repeat"(%37360, %37365) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%37366, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
    %37367 = "torch.aten.mul.Tensor"(%37079, %37346) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%37367, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %37368 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %37369 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %37370 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %37371 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %37372 = "torch.aten.slice.Tensor"(%37079, %37368, %37369, %37370, %37371) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%37372, %18474) <{shape_expressions = #map19}> : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.int) -> ()
    %37373 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %37374 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %37375 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %37376 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %37377 = "torch.aten.slice.Tensor"(%37079, %37373, %37374, %37375, %37376) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%37377, %18474) <{shape_expressions = #map19}> : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.int) -> ()
    %37378 = "torch.aten.neg"(%37377) : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>) -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%37378, %18474) <{shape_expressions = #map19}> : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.int) -> ()
    %37379 = "torch.prim.ListConstruct"(%37378, %37372) : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>) -> !torch.list<vtensor>
    %37380 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %37381 = "torch.aten.cat"(%37379, %37380) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%37381, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %37382 = "torch.aten.mul.Tensor"(%37381, %37366) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%37382, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %37383 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %37384 = "torch.aten.add.Tensor"(%37367, %37382, %37383) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%37384, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %37385 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %37386 = "torch.aten.mul.Scalar"(%arg69, %37385) : (!torch.vtensor<[4,?],si64>, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%37386, %18474) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %37387 = "torch.constant.int"() <{value = 56 : i64}> : () -> !torch.int
    %37388 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %37389 = "torch.aten.add.Scalar"(%37386, %37387, %37388) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%37389, %18474) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %37390 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %37391 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %37392 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %37393 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %37394 = "torch.prim.ListConstruct"(%37390, %18477, %37391, %37392, %37393) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %37395 = "torch.aten.view"(%37384, %37394) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%37395, %18474) <{shape_expressions = #map22}> : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %37396 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %37397 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %37398 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %37399 = "torch.prim.ListConstruct"(%19011, %37396, %37397, %37398) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %37400 = "torch.aten.view"(%37395, %37399) : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%37400, %18474) <{shape_expressions = #map23}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %37401 = "torch.prim.ListConstruct"(%19011) : (!torch.int) -> !torch.list<int>
    %37402 = "torch.aten.view"(%37389, %37401) : (!torch.vtensor<[4,?],si64>, !torch.list<int>) -> !torch.vtensor<[?],si64>
    "torch.bind_symbolic_shape"(%37402, %18474) <{shape_expressions = #map24}> : (!torch.vtensor<[?],si64>, !torch.int) -> ()
    %37403 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %37404 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %37405 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %37406 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %37407 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %37408 = "torch.prim.ListConstruct"(%18479, %37403, %37404, %37405, %37406, %37407) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %37409 = "torch.aten.view"(%36811, %37408) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%37409, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %37410 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %37411 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %37412 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %37413 = "torch.prim.ListConstruct"(%18993, %37410, %37411, %37412) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %37414 = "torch.aten.view"(%37409, %37413) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%37414, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %37415 = "torch.prim.ListConstruct"(%37402) : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>>
    %37416 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %37417 = "torch.aten.index_put"(%37414, %37415, %37400, %37416) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.bool) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%37417, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %37418 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %37419 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %37420 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %37421 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %37422 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %37423 = "torch.prim.ListConstruct"(%18479, %37418, %37419, %37420, %37421, %37422) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %37424 = "torch.aten.view"(%37417, %37423) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%37424, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %37425 = "torch.constant.int"() <{value = 2097152 : i64}> : () -> !torch.int
    %37426 = "torch.prim.ListConstruct"(%18479, %37425) : (!torch.int, !torch.int) -> !torch.list<int>
    %37427 = "torch.aten.view"(%37424, %37426) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,2097152],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%37427, %18475) <{shape_expressions = #map2}> : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> ()
    %37428 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %37429 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %37430 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %37431 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %37432 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %37433 = "torch.prim.ListConstruct"(%18479, %37428, %37429, %37430, %37431, %37432) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %37434 = "torch.aten.view"(%37427, %37433) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%37434, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %37435 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %37436 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %37437 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %37438 = "torch.prim.ListConstruct"(%18993, %37435, %37436, %37437) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %37439 = "torch.aten.view"(%37434, %37438) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%37439, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %37440 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %37441 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %37442 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %37443 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %37444 = "torch.prim.ListConstruct"(%37440, %18477, %37441, %37442, %37443) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %37445 = "torch.aten.view"(%37084, %37444) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%37445, %18474) <{shape_expressions = #map22}> : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %37446 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %37447 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %37448 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %37449 = "torch.prim.ListConstruct"(%19011, %37446, %37447, %37448) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %37450 = "torch.aten.view"(%37445, %37449) : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%37450, %18474) <{shape_expressions = #map23}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %37451 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %37452 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %37453 = "torch.aten.add.Scalar"(%37389, %37451, %37452) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%37453, %18474) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %37454 = "torch.prim.ListConstruct"(%19011) : (!torch.int) -> !torch.list<int>
    %37455 = "torch.aten.view"(%37453, %37454) : (!torch.vtensor<[4,?],si64>, !torch.list<int>) -> !torch.vtensor<[?],si64>
    "torch.bind_symbolic_shape"(%37455, %18474) <{shape_expressions = #map24}> : (!torch.vtensor<[?],si64>, !torch.int) -> ()
    %37456 = "torch.prim.ListConstruct"(%37455) : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>>
    %37457 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %37458 = "torch.aten.index_put"(%37439, %37456, %37450, %37457) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.bool) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%37458, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %37459 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %37460 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %37461 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %37462 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %37463 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %37464 = "torch.prim.ListConstruct"(%18479, %37459, %37460, %37461, %37462, %37463) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %37465 = "torch.aten.view"(%37458, %37464) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%37465, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %37466 = "torch.constant.int"() <{value = 2097152 : i64}> : () -> !torch.int
    %37467 = "torch.prim.ListConstruct"(%18479, %37466) : (!torch.int, !torch.int) -> !torch.list<int>
    %37468 = "torch.aten.view"(%37465, %37467) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,2097152],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%37468, %18475) <{shape_expressions = #map2}> : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> ()
    %37469 = "torch.constant.int"() <{value = -2 : i64}> : () -> !torch.int
    %37470 = "torch.aten.unsqueeze"(%37384, %37469) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%37470, %18474) <{shape_expressions = #map25}> : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.int) -> ()
    %37471 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %37472 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %37473 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %37474 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %37475 = "torch.prim.ListConstruct"(%37471, %18481, %37472, %37473, %37474) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %37476 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %37477 = "torch.aten.expand"(%37470, %37475, %37476) : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%37477, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %37478 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %37479 = "torch.aten.clone"(%37477, %37478) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%37479, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %37480 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %37481 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %37482 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %37483 = "torch.prim.ListConstruct"(%37480, %18481, %37481, %37482) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %37484 = "torch.aten._unsafe_view"(%37479, %37483) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%37484, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %37485 = "torch.constant.int"() <{value = -2 : i64}> : () -> !torch.int
    %37486 = "torch.aten.unsqueeze"(%37084, %37485) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%37486, %18474) <{shape_expressions = #map25}> : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.int) -> ()
    %37487 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %37488 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %37489 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %37490 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %37491 = "torch.prim.ListConstruct"(%37487, %18481, %37488, %37489, %37490) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %37492 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %37493 = "torch.aten.expand"(%37486, %37491, %37492) : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%37493, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %37494 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %37495 = "torch.aten.clone"(%37493, %37494) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%37495, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %37496 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %37497 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %37498 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %37499 = "torch.prim.ListConstruct"(%37496, %18481, %37497, %37498) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %37500 = "torch.aten._unsafe_view"(%37495, %37499) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%37500, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %37501 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %37502 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %37503 = "torch.aten.transpose.int"(%37234, %37501, %37502) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%37503, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %37504 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %37505 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %37506 = "torch.aten.transpose.int"(%37484, %37504, %37505) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%37506, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %37507 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %37508 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %37509 = "torch.aten.transpose.int"(%37500, %37507, %37508) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%37509, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %37510 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %37511 = "torch.aten.squeeze.dim"(%18570, %37510) : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,1,?,?],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%37511, %18474) <{shape_expressions = #map8}> : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> ()
    %37512 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %37513 = "torch.aten.squeeze.dim"(%37511, %37512) : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,1,?,?],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%37513, %18474) <{shape_expressions = #map8}> : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> ()
    %37514 = "torch_c.to_builtin_tensor"(%37503) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %37515 = "torch_c.to_builtin_tensor"(%37506) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %37516 = "torch_c.to_builtin_tensor"(%37509) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %37517 = "torch_c.to_builtin_tensor"(%37513) : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>) -> tensor<4x1x?x?xf8E4M3FNUZ>
    %37518 = "tensor.cast"(%37517) : (tensor<4x1x?x?xf8E4M3FNUZ>) -> tensor<?x?x?x?xf8E4M3FNUZ>
    %37519 = "torch_c.to_builtin_tensor"(%18329) : (!torch.vtensor<[],f32>) -> tensor<f32>
    %37520 = "util.call"(%37514, %37515, %37516, %37519, %37518) <{callee = @sharktank_masked_flash_attention_4_32_128_128_f8E4M3FNUZ_f32_f32}> : (tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<f32>, tensor<?x?x?x?xf8E4M3FNUZ>) -> tensor<4x32x?x128xf32>
    %37521 = "torch_c.from_builtin_tensor"(%37520) : (tensor<4x32x?x128xf32>) -> !torch.vtensor<[4,32,?,128],f32>
    "torch.bind_symbolic_shape"(%37521, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f32>, !torch.int) -> ()
    %37522 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %37523 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %37524 = "torch.aten.transpose.int"(%37521, %37522, %37523) : (!torch.vtensor<[4,32,?,128],f32>, !torch.int, !torch.int) -> !torch.vtensor<[4,?,32,128],f32>
    "torch.bind_symbolic_shape"(%37524, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f32>, !torch.int) -> ()
    %37525 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %37526 = "torch.aten.clone"(%37524, %37525) : (!torch.vtensor<[4,?,32,128],f32>, !torch.int) -> !torch.vtensor<[4,?,32,128],f32>
    "torch.bind_symbolic_shape"(%37526, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f32>, !torch.int) -> ()
    %37527 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %37528 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %37529 = "torch.prim.ListConstruct"(%37527, %18481, %37528) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %37530 = "torch.aten._unsafe_view"(%37526, %37529) : (!torch.vtensor<[4,?,32,128],f32>, !torch.list<int>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%37530, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %37531 = "torch.aten.div.Tensor"(%37530, %18331) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%37531, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %37532 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %37533 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %37534 = "torch.aten.clamp"(%37531, %37532, %37533) : (!torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%37534, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %37535 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %37536 = "torch.prims.convert_element_type"(%37534, %37535) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%37536, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %37537 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %37538 = "torch.aten.unsqueeze"(%18333, %37537) : (!torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>
    %37539 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %37540 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %37541 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %37542 = "torch.prim.ListConstruct"(%37539, %37540, %37541) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %37543 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %37544 = "torch.aten.expand"(%37538, %37542, %37543) : (!torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ>
    %37545 = "torch_c.to_builtin_tensor"(%37536) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %37546 = "torch_c.to_builtin_tensor"(%37544) : (!torch.vtensor<[4,4096,4096],f8E4M3FNUZ>) -> tensor<4x4096x4096xf8E4M3FNUZ>
    %37547 = "util.call"(%37545, %37546) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x?x4096xf32>
    %37548 = "torch_c.from_builtin_tensor"(%37547) : (tensor<4x?x4096xf32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%37548, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %37549 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %37550 = "torch.prims.convert_element_type"(%37548, %37549) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%37550, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %37551 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %37552 = "torch.aten.add.Tensor"(%36978, %37550, %37551) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%37552, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %37553 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %37554 = "torch.prims.convert_element_type"(%37552, %37553) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%37554, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %37555 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %37556 = "torch.aten.pow.Tensor_Scalar"(%37554, %37555) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%37556, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %37557 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %37558 = "torch.prim.ListConstruct"(%37557) : (!torch.int) -> !torch.list<int>
    %37559 = "torch.constant.bool"() <{value = true}> : () -> !torch.bool
    %37560 = "torch.constant.none"() : () -> !torch.none
    %37561 = "torch.aten.mean.dim"(%37556, %37558, %37559, %37560) : (!torch.vtensor<[4,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%37561, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %37562 = "torch.constant.float"() <{value = 1.000000e-05 : f64}> : () -> !torch.float
    %37563 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %37564 = "torch.aten.add.Scalar"(%37561, %37562, %37563) : (!torch.vtensor<[4,?,1],f32>, !torch.float, !torch.int) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%37564, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %37565 = "torch.aten.rsqrt"(%37564) : (!torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%37565, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %37566 = "torch.aten.mul.Tensor"(%37554, %37565) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%37566, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %37567 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %37568 = "torch.prims.convert_element_type"(%37566, %37567) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%37568, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %37569 = "torch.aten.mul.Tensor"(%18335, %37568) : (!torch.vtensor<[4096],bf16>, !torch.vtensor<[4,?,4096],bf16>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%37569, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %37570 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %37571 = "torch.prims.convert_element_type"(%37569, %37570) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%37571, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %37572 = "torch.aten.div.Tensor"(%37571, %18337) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%37572, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %37573 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %37574 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %37575 = "torch.aten.clamp"(%37572, %37573, %37574) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%37575, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %37576 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %37577 = "torch.prims.convert_element_type"(%37575, %37576) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%37577, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %37578 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %37579 = "torch.aten.unsqueeze"(%18339, %37578) : (!torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>
    %37580 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %37581 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %37582 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %37583 = "torch.prim.ListConstruct"(%37580, %37581, %37582) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %37584 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %37585 = "torch.aten.expand"(%37579, %37583, %37584) : (!torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ>
    %37586 = "torch_c.to_builtin_tensor"(%37577) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %37587 = "torch_c.to_builtin_tensor"(%37585) : (!torch.vtensor<[4,14336,4096],f8E4M3FNUZ>) -> tensor<4x14336x4096xf8E4M3FNUZ>
    %37588 = "util.call"(%37586, %37587) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x?x14336xf32>
    %37589 = "torch_c.from_builtin_tensor"(%37588) : (tensor<4x?x14336xf32>) -> !torch.vtensor<[4,?,14336],f32>
    "torch.bind_symbolic_shape"(%37589, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> ()
    %37590 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %37591 = "torch.prims.convert_element_type"(%37589, %37590) : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%37591, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %37592 = "torch.aten.silu"(%37591) : (!torch.vtensor<[4,?,14336],bf16>) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%37592, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %37593 = "torch.aten.div.Tensor"(%37571, %18341) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%37593, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %37594 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %37595 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %37596 = "torch.aten.clamp"(%37593, %37594, %37595) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%37596, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %37597 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %37598 = "torch.prims.convert_element_type"(%37596, %37597) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%37598, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %37599 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %37600 = "torch.aten.unsqueeze"(%18343, %37599) : (!torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>
    %37601 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %37602 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %37603 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %37604 = "torch.prim.ListConstruct"(%37601, %37602, %37603) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %37605 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %37606 = "torch.aten.expand"(%37600, %37604, %37605) : (!torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ>
    %37607 = "torch_c.to_builtin_tensor"(%37598) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %37608 = "torch_c.to_builtin_tensor"(%37606) : (!torch.vtensor<[4,14336,4096],f8E4M3FNUZ>) -> tensor<4x14336x4096xf8E4M3FNUZ>
    %37609 = "util.call"(%37607, %37608) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x?x14336xf32>
    %37610 = "torch_c.from_builtin_tensor"(%37609) : (tensor<4x?x14336xf32>) -> !torch.vtensor<[4,?,14336],f32>
    "torch.bind_symbolic_shape"(%37610, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> ()
    %37611 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %37612 = "torch.prims.convert_element_type"(%37610, %37611) : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%37612, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %37613 = "torch.aten.mul.Tensor"(%37592, %37612) : (!torch.vtensor<[4,?,14336],bf16>, !torch.vtensor<[4,?,14336],bf16>) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%37613, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %37614 = "torch.aten.div.Tensor"(%37613, %18345) : (!torch.vtensor<[4,?,14336],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%37614, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %37615 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %37616 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %37617 = "torch.aten.clamp"(%37614, %37615, %37616) : (!torch.vtensor<[4,?,14336],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%37617, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %37618 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %37619 = "torch.prims.convert_element_type"(%37617, %37618) : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> !torch.vtensor<[4,?,14336],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%37619, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],f8E4M3FNUZ>, !torch.int) -> ()
    %37620 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %37621 = "torch.aten.unsqueeze"(%18347, %37620) : (!torch.vtensor<[4096,14336],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,14336],f8E4M3FNUZ>
    %37622 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %37623 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %37624 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %37625 = "torch.prim.ListConstruct"(%37622, %37623, %37624) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %37626 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %37627 = "torch.aten.expand"(%37621, %37625, %37626) : (!torch.vtensor<[1,4096,14336],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,14336],f8E4M3FNUZ>
    %37628 = "torch_c.to_builtin_tensor"(%37619) : (!torch.vtensor<[4,?,14336],f8E4M3FNUZ>) -> tensor<4x?x14336xf8E4M3FNUZ>
    %37629 = "torch_c.to_builtin_tensor"(%37627) : (!torch.vtensor<[4,4096,14336],f8E4M3FNUZ>) -> tensor<4x4096x14336xf8E4M3FNUZ>
    %37630 = "util.call"(%37628, %37629) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx14336xf8E4M3FNUZ_R4x4096x14336xf8E4M3FNUZ}> : (tensor<4x?x14336xf8E4M3FNUZ>, tensor<4x4096x14336xf8E4M3FNUZ>) -> tensor<4x?x4096xf32>
    %37631 = "torch_c.from_builtin_tensor"(%37630) : (tensor<4x?x4096xf32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%37631, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %37632 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %37633 = "torch.prims.convert_element_type"(%37631, %37632) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%37633, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %37634 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %37635 = "torch.aten.add.Tensor"(%37552, %37633, %37634) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%37635, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %37636 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %37637 = "torch.prims.convert_element_type"(%37635, %37636) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%37637, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %37638 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %37639 = "torch.aten.pow.Tensor_Scalar"(%37637, %37638) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%37639, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %37640 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %37641 = "torch.prim.ListConstruct"(%37640) : (!torch.int) -> !torch.list<int>
    %37642 = "torch.constant.bool"() <{value = true}> : () -> !torch.bool
    %37643 = "torch.constant.none"() : () -> !torch.none
    %37644 = "torch.aten.mean.dim"(%37639, %37641, %37642, %37643) : (!torch.vtensor<[4,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%37644, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %37645 = "torch.constant.float"() <{value = 1.000000e-05 : f64}> : () -> !torch.float
    %37646 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %37647 = "torch.aten.add.Scalar"(%37644, %37645, %37646) : (!torch.vtensor<[4,?,1],f32>, !torch.float, !torch.int) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%37647, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %37648 = "torch.aten.rsqrt"(%37647) : (!torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%37648, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %37649 = "torch.aten.mul.Tensor"(%37637, %37648) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%37649, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %37650 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %37651 = "torch.prims.convert_element_type"(%37649, %37650) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%37651, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %37652 = "torch.aten.mul.Tensor"(%18349, %37651) : (!torch.vtensor<[4096],bf16>, !torch.vtensor<[4,?,4096],bf16>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%37652, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %37653 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %37654 = "torch.prims.convert_element_type"(%37652, %37653) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%37654, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %37655 = "torch.aten.div.Tensor"(%37654, %18351) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%37655, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %37656 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %37657 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %37658 = "torch.aten.clamp"(%37655, %37656, %37657) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%37658, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %37659 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %37660 = "torch.prims.convert_element_type"(%37658, %37659) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%37660, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %37661 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %37662 = "torch.aten.unsqueeze"(%18353, %37661) : (!torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>
    %37663 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %37664 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %37665 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %37666 = "torch.prim.ListConstruct"(%37663, %37664, %37665) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %37667 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %37668 = "torch.aten.expand"(%37662, %37666, %37667) : (!torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ>
    %37669 = "torch_c.to_builtin_tensor"(%37660) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %37670 = "torch_c.to_builtin_tensor"(%37668) : (!torch.vtensor<[4,4096,4096],f8E4M3FNUZ>) -> tensor<4x4096x4096xf8E4M3FNUZ>
    %37671 = "util.call"(%37669, %37670) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x?x4096xf32>
    %37672 = "torch_c.from_builtin_tensor"(%37671) : (tensor<4x?x4096xf32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%37672, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %37673 = "torch.aten.div.Tensor"(%37672, %18355) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%37673, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %37674 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %37675 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %37676 = "torch.aten.clamp"(%37673, %37674, %37675) : (!torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%37676, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %37677 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %37678 = "torch.prims.convert_element_type"(%37676, %37677) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%37678, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %37679 = "torch.aten.div.Tensor"(%37654, %18357) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%37679, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %37680 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %37681 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %37682 = "torch.aten.clamp"(%37679, %37680, %37681) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%37682, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %37683 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %37684 = "torch.prims.convert_element_type"(%37682, %37683) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%37684, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %37685 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %37686 = "torch.aten.unsqueeze"(%18359, %37685) : (!torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>
    %37687 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %37688 = "torch.constant.int"() <{value = 1024 : i64}> : () -> !torch.int
    %37689 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %37690 = "torch.prim.ListConstruct"(%37687, %37688, %37689) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %37691 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %37692 = "torch.aten.expand"(%37686, %37690, %37691) : (!torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ>
    %37693 = "torch_c.to_builtin_tensor"(%37684) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %37694 = "torch_c.to_builtin_tensor"(%37692) : (!torch.vtensor<[4,1024,4096],f8E4M3FNUZ>) -> tensor<4x1024x4096xf8E4M3FNUZ>
    %37695 = "util.call"(%37693, %37694) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x?x1024xf32>
    %37696 = "torch_c.from_builtin_tensor"(%37695) : (tensor<4x?x1024xf32>) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%37696, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %37697 = "torch.aten.div.Tensor"(%37696, %18361) : (!torch.vtensor<[4,?,1024],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%37697, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %37698 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %37699 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %37700 = "torch.aten.clamp"(%37697, %37698, %37699) : (!torch.vtensor<[4,?,1024],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%37700, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %37701 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %37702 = "torch.prims.convert_element_type"(%37700, %37701) : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> !torch.vtensor<[4,?,1024],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%37702, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.int) -> ()
    %37703 = "torch.aten.div.Tensor"(%37654, %18363) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%37703, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %37704 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %37705 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %37706 = "torch.aten.clamp"(%37703, %37704, %37705) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%37706, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %37707 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %37708 = "torch.prims.convert_element_type"(%37706, %37707) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%37708, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %37709 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %37710 = "torch.aten.unsqueeze"(%18365, %37709) : (!torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>
    %37711 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %37712 = "torch.constant.int"() <{value = 1024 : i64}> : () -> !torch.int
    %37713 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %37714 = "torch.prim.ListConstruct"(%37711, %37712, %37713) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %37715 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %37716 = "torch.aten.expand"(%37710, %37714, %37715) : (!torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ>
    %37717 = "torch_c.to_builtin_tensor"(%37708) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %37718 = "torch_c.to_builtin_tensor"(%37716) : (!torch.vtensor<[4,1024,4096],f8E4M3FNUZ>) -> tensor<4x1024x4096xf8E4M3FNUZ>
    %37719 = "util.call"(%37717, %37718) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x?x1024xf32>
    %37720 = "torch_c.from_builtin_tensor"(%37719) : (tensor<4x?x1024xf32>) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%37720, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %37721 = "torch.aten.div.Tensor"(%37720, %18367) : (!torch.vtensor<[4,?,1024],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%37721, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %37722 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %37723 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %37724 = "torch.aten.clamp"(%37721, %37722, %37723) : (!torch.vtensor<[4,?,1024],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%37724, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %37725 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %37726 = "torch.prims.convert_element_type"(%37724, %37725) : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> !torch.vtensor<[4,?,1024],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%37726, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.int) -> ()
    %37727 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %37728 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %37729 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %37730 = "torch.prim.ListConstruct"(%37727, %18481, %37728, %37729) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %37731 = "torch.aten.view"(%37678, %37730) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%37731, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %37732 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %37733 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %37734 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %37735 = "torch.prim.ListConstruct"(%37732, %18481, %37733, %37734) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %37736 = "torch.aten.view"(%37702, %37735) : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%37736, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %37737 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %37738 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %37739 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %37740 = "torch.prim.ListConstruct"(%37737, %18481, %37738, %37739) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %37741 = "torch.aten.view"(%37726, %37740) : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%37741, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %37742 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
    %37743 = "torch.constant.none"() : () -> !torch.none
    %37744 = "torch.constant.none"() : () -> !torch.none
    %37745 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %37746 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %37747 = "torch.aten.arange"(%37742, %37743, %37744, %37745, %37746) : (!torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[131072],si64>
    %37748 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %37749 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %37750 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %37751 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %37752 = "torch.constant.none"() : () -> !torch.none
    %37753 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %37754 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %37755 = "torch.aten.arange.start_step"(%37748, %37749, %37750, %37751, %37752, %37753, %37754) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[64],si64>
    %37756 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %37757 = "torch.prims.convert_element_type"(%37755, %37756) : (!torch.vtensor<[64],si64>, !torch.int) -> !torch.vtensor<[64],f32>
    %37758 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %37759 = "torch.aten.div.Scalar"(%37757, %37758) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %37760 = "torch.constant.float"() <{value = 5.000000e+05 : f64}> : () -> !torch.float
    %37761 = "torch.aten.pow.Scalar"(%37760, %37759) : (!torch.float, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %37762 = "torch.aten.reciprocal"(%37761) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %37763 = "torch.constant.float"() <{value = 1.000000e+00 : f64}> : () -> !torch.float
    %37764 = "torch.aten.mul.Scalar"(%37762, %37763) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
    %37765 = "torch.aten.reciprocal"(%37764) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %37766 = "torch.constant.float"() <{value = 6.2831853071795862 : f64}> : () -> !torch.float
    %37767 = "torch.aten.mul.Scalar"(%37765, %37766) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
    %37768 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
    %37769 = "torch.aten.gt.Scalar"(%37767, %37768) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %37770 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %37771 = "torch.aten.div.Scalar"(%37764, %37770) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %37772 = "torch.aten.where.self"(%37769, %37771, %37764) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %37773 = "torch.aten.reciprocal"(%37767) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %37774 = "torch.constant.int"() <{value = 8192 : i64}> : () -> !torch.int
    %37775 = "torch.aten.mul.Scalar"(%37773, %37774) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %37776 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %37777 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %37778 = "torch.aten.sub.Scalar"(%37775, %37776, %37777) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
    %37779 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %37780 = "torch.aten.div.Scalar"(%37778, %37779) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %37781 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %37782 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %37783 = "torch.aten.rsub.Scalar"(%37780, %37781, %37782) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
    %37784 = "torch.aten.mul.Tensor"(%37783, %37772) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %37785 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %37786 = "torch.aten.div.Scalar"(%37784, %37785) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %37787 = "torch.aten.mul.Tensor"(%37780, %37772) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %37788 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %37789 = "torch.aten.add.Tensor"(%37786, %37787, %37788) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %37790 = "torch.constant.float"() <{value = 2.048000e+03 : f64}> : () -> !torch.float
    %37791 = "torch.aten.lt.Scalar"(%37767, %37790) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %37792 = "torch.aten.bitwise_not"(%37791) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %37793 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
    %37794 = "torch.aten.gt.Scalar"(%37767, %37793) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %37795 = "torch.aten.bitwise_not"(%37794) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %37796 = "torch.aten.mul.Tensor"(%37792, %37795) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %37797 = "torch.aten.where.self"(%37796, %37789, %37772) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %37798 = "torch.prim.ListConstruct"(%37797, %37797) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.list<vtensor>
    %37799 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %37800 = "torch.aten.cat"(%37798, %37799) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[128],f32>
    %37801 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %37802 = "torch.prims.convert_element_type"(%37747, %37801) : (!torch.vtensor<[131072],si64>, !torch.int) -> !torch.vtensor<[131072],f32>
    %37803 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %37804 = "torch.prims.convert_element_type"(%37800, %37803) : (!torch.vtensor<[128],f32>, !torch.int) -> !torch.vtensor<[128],f32>
    %37805 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
    %37806 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %37807 = "torch.prim.ListConstruct"(%37805, %37806) : (!torch.int, !torch.int) -> !torch.list<int>
    %37808 = "torch.aten.view"(%37802, %37807) : (!torch.vtensor<[131072],f32>, !torch.list<int>) -> !torch.vtensor<[131072,1],f32>
    %37809 = "torch.aten.mul.Tensor"(%37808, %37804) : (!torch.vtensor<[131072,1],f32>, !torch.vtensor<[128],f32>) -> !torch.vtensor<[131072,128],f32>
    %37810 = "torch.aten.cos"(%37809) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
    %37811 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %37812 = "torch.prims.convert_element_type"(%37810, %37811) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
    %37813 = "torch.aten.sin"(%37809) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
    %37814 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %37815 = "torch.prims.convert_element_type"(%37813, %37814) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
    %37816 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %37817 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %37818 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %37819 = "torch.aten.slice.Tensor"(%37812, %37816, %37817, %18481, %37818) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%37819, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %37820 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %37821 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %37822 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %37823 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %37824 = "torch.aten.slice.Tensor"(%37819, %37820, %37821, %37822, %37823) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%37824, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %37825 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %37826 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %37827 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %37828 = "torch.aten.slice.Tensor"(%37815, %37825, %37826, %18481, %37827) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%37828, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %37829 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %37830 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %37831 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %37832 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %37833 = "torch.aten.slice.Tensor"(%37828, %37829, %37830, %37831, %37832) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%37833, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %37834 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %37835 = "torch.aten.unsqueeze"(%37824, %37834) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%37835, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %37836 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %37837 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %37838 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %37839 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %37840 = "torch.aten.slice.Tensor"(%37835, %37836, %37837, %37838, %37839) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%37840, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %37841 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %37842 = "torch.aten.unsqueeze"(%37840, %37841) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%37842, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %37843 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %37844 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %37845 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %37846 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %37847 = "torch.aten.slice.Tensor"(%37842, %37843, %37844, %37845, %37846) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%37847, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %37848 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %37849 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %37850 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %37851 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %37852 = "torch.prim.ListConstruct"(%37848, %37849, %37850, %37851) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %37853 = "torch.aten.repeat"(%37847, %37852) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%37853, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
    %37854 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %37855 = "torch.aten.unsqueeze"(%37833, %37854) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%37855, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %37856 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %37857 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %37858 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %37859 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %37860 = "torch.aten.slice.Tensor"(%37855, %37856, %37857, %37858, %37859) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%37860, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %37861 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %37862 = "torch.aten.unsqueeze"(%37860, %37861) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%37862, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %37863 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %37864 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %37865 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %37866 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %37867 = "torch.aten.slice.Tensor"(%37862, %37863, %37864, %37865, %37866) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%37867, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %37868 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %37869 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %37870 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %37871 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %37872 = "torch.prim.ListConstruct"(%37868, %37869, %37870, %37871) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %37873 = "torch.aten.repeat"(%37867, %37872) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%37873, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
    %37874 = "torch.aten.mul.Tensor"(%37731, %37853) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%37874, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %37875 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %37876 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %37877 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %37878 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %37879 = "torch.aten.slice.Tensor"(%37731, %37875, %37876, %37877, %37878) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%37879, %18474) <{shape_expressions = #map18}> : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.int) -> ()
    %37880 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %37881 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %37882 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %37883 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %37884 = "torch.aten.slice.Tensor"(%37731, %37880, %37881, %37882, %37883) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%37884, %18474) <{shape_expressions = #map18}> : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.int) -> ()
    %37885 = "torch.aten.neg"(%37884) : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>) -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%37885, %18474) <{shape_expressions = #map18}> : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.int) -> ()
    %37886 = "torch.prim.ListConstruct"(%37885, %37879) : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>) -> !torch.list<vtensor>
    %37887 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %37888 = "torch.aten.cat"(%37886, %37887) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%37888, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %37889 = "torch.aten.mul.Tensor"(%37888, %37873) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%37889, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %37890 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %37891 = "torch.aten.add.Tensor"(%37874, %37889, %37890) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%37891, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %37892 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
    %37893 = "torch.constant.none"() : () -> !torch.none
    %37894 = "torch.constant.none"() : () -> !torch.none
    %37895 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %37896 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %37897 = "torch.aten.arange"(%37892, %37893, %37894, %37895, %37896) : (!torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[131072],si64>
    %37898 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %37899 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %37900 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %37901 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %37902 = "torch.constant.none"() : () -> !torch.none
    %37903 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %37904 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %37905 = "torch.aten.arange.start_step"(%37898, %37899, %37900, %37901, %37902, %37903, %37904) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[64],si64>
    %37906 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %37907 = "torch.prims.convert_element_type"(%37905, %37906) : (!torch.vtensor<[64],si64>, !torch.int) -> !torch.vtensor<[64],f32>
    %37908 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %37909 = "torch.aten.div.Scalar"(%37907, %37908) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %37910 = "torch.constant.float"() <{value = 5.000000e+05 : f64}> : () -> !torch.float
    %37911 = "torch.aten.pow.Scalar"(%37910, %37909) : (!torch.float, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %37912 = "torch.aten.reciprocal"(%37911) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %37913 = "torch.constant.float"() <{value = 1.000000e+00 : f64}> : () -> !torch.float
    %37914 = "torch.aten.mul.Scalar"(%37912, %37913) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
    %37915 = "torch.aten.reciprocal"(%37914) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %37916 = "torch.constant.float"() <{value = 6.2831853071795862 : f64}> : () -> !torch.float
    %37917 = "torch.aten.mul.Scalar"(%37915, %37916) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
    %37918 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
    %37919 = "torch.aten.gt.Scalar"(%37917, %37918) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %37920 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %37921 = "torch.aten.div.Scalar"(%37914, %37920) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %37922 = "torch.aten.where.self"(%37919, %37921, %37914) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %37923 = "torch.aten.reciprocal"(%37917) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %37924 = "torch.constant.int"() <{value = 8192 : i64}> : () -> !torch.int
    %37925 = "torch.aten.mul.Scalar"(%37923, %37924) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %37926 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %37927 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %37928 = "torch.aten.sub.Scalar"(%37925, %37926, %37927) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
    %37929 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %37930 = "torch.aten.div.Scalar"(%37928, %37929) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %37931 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %37932 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %37933 = "torch.aten.rsub.Scalar"(%37930, %37931, %37932) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
    %37934 = "torch.aten.mul.Tensor"(%37933, %37922) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %37935 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %37936 = "torch.aten.div.Scalar"(%37934, %37935) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %37937 = "torch.aten.mul.Tensor"(%37930, %37922) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %37938 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %37939 = "torch.aten.add.Tensor"(%37936, %37937, %37938) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %37940 = "torch.constant.float"() <{value = 2.048000e+03 : f64}> : () -> !torch.float
    %37941 = "torch.aten.lt.Scalar"(%37917, %37940) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %37942 = "torch.aten.bitwise_not"(%37941) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %37943 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
    %37944 = "torch.aten.gt.Scalar"(%37917, %37943) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %37945 = "torch.aten.bitwise_not"(%37944) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %37946 = "torch.aten.mul.Tensor"(%37942, %37945) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %37947 = "torch.aten.where.self"(%37946, %37939, %37922) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %37948 = "torch.prim.ListConstruct"(%37947, %37947) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.list<vtensor>
    %37949 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %37950 = "torch.aten.cat"(%37948, %37949) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[128],f32>
    %37951 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %37952 = "torch.prims.convert_element_type"(%37897, %37951) : (!torch.vtensor<[131072],si64>, !torch.int) -> !torch.vtensor<[131072],f32>
    %37953 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %37954 = "torch.prims.convert_element_type"(%37950, %37953) : (!torch.vtensor<[128],f32>, !torch.int) -> !torch.vtensor<[128],f32>
    %37955 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
    %37956 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %37957 = "torch.prim.ListConstruct"(%37955, %37956) : (!torch.int, !torch.int) -> !torch.list<int>
    %37958 = "torch.aten.view"(%37952, %37957) : (!torch.vtensor<[131072],f32>, !torch.list<int>) -> !torch.vtensor<[131072,1],f32>
    %37959 = "torch.aten.mul.Tensor"(%37958, %37954) : (!torch.vtensor<[131072,1],f32>, !torch.vtensor<[128],f32>) -> !torch.vtensor<[131072,128],f32>
    %37960 = "torch.aten.cos"(%37959) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
    %37961 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %37962 = "torch.prims.convert_element_type"(%37960, %37961) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
    %37963 = "torch.aten.sin"(%37959) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
    %37964 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %37965 = "torch.prims.convert_element_type"(%37963, %37964) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
    %37966 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %37967 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %37968 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %37969 = "torch.aten.slice.Tensor"(%37962, %37966, %37967, %18481, %37968) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%37969, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %37970 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %37971 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %37972 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %37973 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %37974 = "torch.aten.slice.Tensor"(%37969, %37970, %37971, %37972, %37973) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%37974, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %37975 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %37976 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %37977 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %37978 = "torch.aten.slice.Tensor"(%37965, %37975, %37976, %18481, %37977) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%37978, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %37979 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %37980 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %37981 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %37982 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %37983 = "torch.aten.slice.Tensor"(%37978, %37979, %37980, %37981, %37982) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%37983, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %37984 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %37985 = "torch.aten.unsqueeze"(%37974, %37984) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%37985, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %37986 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %37987 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %37988 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %37989 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %37990 = "torch.aten.slice.Tensor"(%37985, %37986, %37987, %37988, %37989) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%37990, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %37991 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %37992 = "torch.aten.unsqueeze"(%37990, %37991) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%37992, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %37993 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %37994 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %37995 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %37996 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %37997 = "torch.aten.slice.Tensor"(%37992, %37993, %37994, %37995, %37996) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%37997, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %37998 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %37999 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %38000 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %38001 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %38002 = "torch.prim.ListConstruct"(%37998, %37999, %38000, %38001) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %38003 = "torch.aten.repeat"(%37997, %38002) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%38003, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
    %38004 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %38005 = "torch.aten.unsqueeze"(%37983, %38004) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%38005, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %38006 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %38007 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %38008 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %38009 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %38010 = "torch.aten.slice.Tensor"(%38005, %38006, %38007, %38008, %38009) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%38010, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %38011 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %38012 = "torch.aten.unsqueeze"(%38010, %38011) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%38012, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %38013 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %38014 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %38015 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %38016 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %38017 = "torch.aten.slice.Tensor"(%38012, %38013, %38014, %38015, %38016) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%38017, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %38018 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %38019 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %38020 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %38021 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %38022 = "torch.prim.ListConstruct"(%38018, %38019, %38020, %38021) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %38023 = "torch.aten.repeat"(%38017, %38022) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%38023, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
    %38024 = "torch.aten.mul.Tensor"(%37736, %38003) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%38024, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %38025 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %38026 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %38027 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %38028 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %38029 = "torch.aten.slice.Tensor"(%37736, %38025, %38026, %38027, %38028) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%38029, %18474) <{shape_expressions = #map19}> : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.int) -> ()
    %38030 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %38031 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %38032 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %38033 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %38034 = "torch.aten.slice.Tensor"(%37736, %38030, %38031, %38032, %38033) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%38034, %18474) <{shape_expressions = #map19}> : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.int) -> ()
    %38035 = "torch.aten.neg"(%38034) : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>) -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%38035, %18474) <{shape_expressions = #map19}> : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.int) -> ()
    %38036 = "torch.prim.ListConstruct"(%38035, %38029) : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>) -> !torch.list<vtensor>
    %38037 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %38038 = "torch.aten.cat"(%38036, %38037) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%38038, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %38039 = "torch.aten.mul.Tensor"(%38038, %38023) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%38039, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %38040 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %38041 = "torch.aten.add.Tensor"(%38024, %38039, %38040) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%38041, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %38042 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %38043 = "torch.aten.mul.Scalar"(%arg69, %38042) : (!torch.vtensor<[4,?],si64>, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%38043, %18474) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %38044 = "torch.constant.int"() <{value = 58 : i64}> : () -> !torch.int
    %38045 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %38046 = "torch.aten.add.Scalar"(%38043, %38044, %38045) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%38046, %18474) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %38047 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %38048 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %38049 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %38050 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %38051 = "torch.prim.ListConstruct"(%38047, %18477, %38048, %38049, %38050) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %38052 = "torch.aten.view"(%38041, %38051) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%38052, %18474) <{shape_expressions = #map22}> : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %38053 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %38054 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %38055 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %38056 = "torch.prim.ListConstruct"(%19011, %38053, %38054, %38055) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %38057 = "torch.aten.view"(%38052, %38056) : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%38057, %18474) <{shape_expressions = #map23}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %38058 = "torch.prim.ListConstruct"(%19011) : (!torch.int) -> !torch.list<int>
    %38059 = "torch.aten.view"(%38046, %38058) : (!torch.vtensor<[4,?],si64>, !torch.list<int>) -> !torch.vtensor<[?],si64>
    "torch.bind_symbolic_shape"(%38059, %18474) <{shape_expressions = #map24}> : (!torch.vtensor<[?],si64>, !torch.int) -> ()
    %38060 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %38061 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %38062 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %38063 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %38064 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %38065 = "torch.prim.ListConstruct"(%18479, %38060, %38061, %38062, %38063, %38064) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %38066 = "torch.aten.view"(%37468, %38065) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%38066, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %38067 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %38068 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %38069 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %38070 = "torch.prim.ListConstruct"(%18993, %38067, %38068, %38069) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %38071 = "torch.aten.view"(%38066, %38070) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%38071, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %38072 = "torch.prim.ListConstruct"(%38059) : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>>
    %38073 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %38074 = "torch.aten.index_put"(%38071, %38072, %38057, %38073) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.bool) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%38074, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %38075 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %38076 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %38077 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %38078 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %38079 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %38080 = "torch.prim.ListConstruct"(%18479, %38075, %38076, %38077, %38078, %38079) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %38081 = "torch.aten.view"(%38074, %38080) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%38081, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %38082 = "torch.constant.int"() <{value = 2097152 : i64}> : () -> !torch.int
    %38083 = "torch.prim.ListConstruct"(%18479, %38082) : (!torch.int, !torch.int) -> !torch.list<int>
    %38084 = "torch.aten.view"(%38081, %38083) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,2097152],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%38084, %18475) <{shape_expressions = #map2}> : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> ()
    %38085 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %38086 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %38087 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %38088 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %38089 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %38090 = "torch.prim.ListConstruct"(%18479, %38085, %38086, %38087, %38088, %38089) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %38091 = "torch.aten.view"(%38084, %38090) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%38091, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %38092 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %38093 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %38094 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %38095 = "torch.prim.ListConstruct"(%18993, %38092, %38093, %38094) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %38096 = "torch.aten.view"(%38091, %38095) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%38096, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %38097 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %38098 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %38099 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %38100 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %38101 = "torch.prim.ListConstruct"(%38097, %18477, %38098, %38099, %38100) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %38102 = "torch.aten.view"(%37741, %38101) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%38102, %18474) <{shape_expressions = #map22}> : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %38103 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %38104 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %38105 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %38106 = "torch.prim.ListConstruct"(%19011, %38103, %38104, %38105) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %38107 = "torch.aten.view"(%38102, %38106) : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%38107, %18474) <{shape_expressions = #map23}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %38108 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %38109 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %38110 = "torch.aten.add.Scalar"(%38046, %38108, %38109) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%38110, %18474) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %38111 = "torch.prim.ListConstruct"(%19011) : (!torch.int) -> !torch.list<int>
    %38112 = "torch.aten.view"(%38110, %38111) : (!torch.vtensor<[4,?],si64>, !torch.list<int>) -> !torch.vtensor<[?],si64>
    "torch.bind_symbolic_shape"(%38112, %18474) <{shape_expressions = #map24}> : (!torch.vtensor<[?],si64>, !torch.int) -> ()
    %38113 = "torch.prim.ListConstruct"(%38112) : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>>
    %38114 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %38115 = "torch.aten.index_put"(%38096, %38113, %38107, %38114) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.bool) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%38115, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %38116 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %38117 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %38118 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %38119 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %38120 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %38121 = "torch.prim.ListConstruct"(%18479, %38116, %38117, %38118, %38119, %38120) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %38122 = "torch.aten.view"(%38115, %38121) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%38122, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %38123 = "torch.constant.int"() <{value = 2097152 : i64}> : () -> !torch.int
    %38124 = "torch.prim.ListConstruct"(%18479, %38123) : (!torch.int, !torch.int) -> !torch.list<int>
    %38125 = "torch.aten.view"(%38122, %38124) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,2097152],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%38125, %18475) <{shape_expressions = #map2}> : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> ()
    %38126 = "torch.constant.int"() <{value = -2 : i64}> : () -> !torch.int
    %38127 = "torch.aten.unsqueeze"(%38041, %38126) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%38127, %18474) <{shape_expressions = #map25}> : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.int) -> ()
    %38128 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %38129 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %38130 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %38131 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %38132 = "torch.prim.ListConstruct"(%38128, %18481, %38129, %38130, %38131) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %38133 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %38134 = "torch.aten.expand"(%38127, %38132, %38133) : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%38134, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %38135 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %38136 = "torch.aten.clone"(%38134, %38135) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%38136, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %38137 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %38138 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %38139 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %38140 = "torch.prim.ListConstruct"(%38137, %18481, %38138, %38139) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %38141 = "torch.aten._unsafe_view"(%38136, %38140) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%38141, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %38142 = "torch.constant.int"() <{value = -2 : i64}> : () -> !torch.int
    %38143 = "torch.aten.unsqueeze"(%37741, %38142) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%38143, %18474) <{shape_expressions = #map25}> : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.int) -> ()
    %38144 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %38145 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %38146 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %38147 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %38148 = "torch.prim.ListConstruct"(%38144, %18481, %38145, %38146, %38147) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %38149 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %38150 = "torch.aten.expand"(%38143, %38148, %38149) : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%38150, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %38151 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %38152 = "torch.aten.clone"(%38150, %38151) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%38152, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %38153 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %38154 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %38155 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %38156 = "torch.prim.ListConstruct"(%38153, %18481, %38154, %38155) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %38157 = "torch.aten._unsafe_view"(%38152, %38156) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%38157, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %38158 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %38159 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %38160 = "torch.aten.transpose.int"(%37891, %38158, %38159) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%38160, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %38161 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %38162 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %38163 = "torch.aten.transpose.int"(%38141, %38161, %38162) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%38163, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %38164 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %38165 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %38166 = "torch.aten.transpose.int"(%38157, %38164, %38165) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%38166, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %38167 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %38168 = "torch.aten.squeeze.dim"(%18570, %38167) : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,1,?,?],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%38168, %18474) <{shape_expressions = #map8}> : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> ()
    %38169 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %38170 = "torch.aten.squeeze.dim"(%38168, %38169) : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,1,?,?],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%38170, %18474) <{shape_expressions = #map8}> : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> ()
    %38171 = "torch_c.to_builtin_tensor"(%38160) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %38172 = "torch_c.to_builtin_tensor"(%38163) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %38173 = "torch_c.to_builtin_tensor"(%38166) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %38174 = "torch_c.to_builtin_tensor"(%38170) : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>) -> tensor<4x1x?x?xf8E4M3FNUZ>
    %38175 = "tensor.cast"(%38174) : (tensor<4x1x?x?xf8E4M3FNUZ>) -> tensor<?x?x?x?xf8E4M3FNUZ>
    %38176 = "torch_c.to_builtin_tensor"(%18369) : (!torch.vtensor<[],f32>) -> tensor<f32>
    %38177 = "util.call"(%38171, %38172, %38173, %38176, %38175) <{callee = @sharktank_masked_flash_attention_4_32_128_128_f8E4M3FNUZ_f32_f32}> : (tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<f32>, tensor<?x?x?x?xf8E4M3FNUZ>) -> tensor<4x32x?x128xf32>
    %38178 = "torch_c.from_builtin_tensor"(%38177) : (tensor<4x32x?x128xf32>) -> !torch.vtensor<[4,32,?,128],f32>
    "torch.bind_symbolic_shape"(%38178, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f32>, !torch.int) -> ()
    %38179 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %38180 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %38181 = "torch.aten.transpose.int"(%38178, %38179, %38180) : (!torch.vtensor<[4,32,?,128],f32>, !torch.int, !torch.int) -> !torch.vtensor<[4,?,32,128],f32>
    "torch.bind_symbolic_shape"(%38181, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f32>, !torch.int) -> ()
    %38182 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %38183 = "torch.aten.clone"(%38181, %38182) : (!torch.vtensor<[4,?,32,128],f32>, !torch.int) -> !torch.vtensor<[4,?,32,128],f32>
    "torch.bind_symbolic_shape"(%38183, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f32>, !torch.int) -> ()
    %38184 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %38185 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %38186 = "torch.prim.ListConstruct"(%38184, %18481, %38185) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %38187 = "torch.aten._unsafe_view"(%38183, %38186) : (!torch.vtensor<[4,?,32,128],f32>, !torch.list<int>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%38187, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %38188 = "torch.aten.div.Tensor"(%38187, %18371) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%38188, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %38189 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %38190 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %38191 = "torch.aten.clamp"(%38188, %38189, %38190) : (!torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%38191, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %38192 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %38193 = "torch.prims.convert_element_type"(%38191, %38192) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%38193, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %38194 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %38195 = "torch.aten.unsqueeze"(%18373, %38194) : (!torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>
    %38196 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %38197 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %38198 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %38199 = "torch.prim.ListConstruct"(%38196, %38197, %38198) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %38200 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %38201 = "torch.aten.expand"(%38195, %38199, %38200) : (!torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ>
    %38202 = "torch_c.to_builtin_tensor"(%38193) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %38203 = "torch_c.to_builtin_tensor"(%38201) : (!torch.vtensor<[4,4096,4096],f8E4M3FNUZ>) -> tensor<4x4096x4096xf8E4M3FNUZ>
    %38204 = "util.call"(%38202, %38203) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x?x4096xf32>
    %38205 = "torch_c.from_builtin_tensor"(%38204) : (tensor<4x?x4096xf32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%38205, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %38206 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %38207 = "torch.prims.convert_element_type"(%38205, %38206) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%38207, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %38208 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %38209 = "torch.aten.add.Tensor"(%37635, %38207, %38208) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%38209, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %38210 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %38211 = "torch.prims.convert_element_type"(%38209, %38210) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%38211, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %38212 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %38213 = "torch.aten.pow.Tensor_Scalar"(%38211, %38212) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%38213, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %38214 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %38215 = "torch.prim.ListConstruct"(%38214) : (!torch.int) -> !torch.list<int>
    %38216 = "torch.constant.bool"() <{value = true}> : () -> !torch.bool
    %38217 = "torch.constant.none"() : () -> !torch.none
    %38218 = "torch.aten.mean.dim"(%38213, %38215, %38216, %38217) : (!torch.vtensor<[4,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%38218, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %38219 = "torch.constant.float"() <{value = 1.000000e-05 : f64}> : () -> !torch.float
    %38220 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %38221 = "torch.aten.add.Scalar"(%38218, %38219, %38220) : (!torch.vtensor<[4,?,1],f32>, !torch.float, !torch.int) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%38221, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %38222 = "torch.aten.rsqrt"(%38221) : (!torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%38222, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %38223 = "torch.aten.mul.Tensor"(%38211, %38222) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%38223, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %38224 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %38225 = "torch.prims.convert_element_type"(%38223, %38224) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%38225, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %38226 = "torch.aten.mul.Tensor"(%18375, %38225) : (!torch.vtensor<[4096],bf16>, !torch.vtensor<[4,?,4096],bf16>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%38226, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %38227 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %38228 = "torch.prims.convert_element_type"(%38226, %38227) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%38228, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %38229 = "torch.aten.div.Tensor"(%38228, %18377) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%38229, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %38230 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %38231 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %38232 = "torch.aten.clamp"(%38229, %38230, %38231) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%38232, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %38233 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %38234 = "torch.prims.convert_element_type"(%38232, %38233) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%38234, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %38235 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %38236 = "torch.aten.unsqueeze"(%18379, %38235) : (!torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>
    %38237 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %38238 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %38239 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %38240 = "torch.prim.ListConstruct"(%38237, %38238, %38239) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %38241 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %38242 = "torch.aten.expand"(%38236, %38240, %38241) : (!torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ>
    %38243 = "torch_c.to_builtin_tensor"(%38234) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %38244 = "torch_c.to_builtin_tensor"(%38242) : (!torch.vtensor<[4,14336,4096],f8E4M3FNUZ>) -> tensor<4x14336x4096xf8E4M3FNUZ>
    %38245 = "util.call"(%38243, %38244) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x?x14336xf32>
    %38246 = "torch_c.from_builtin_tensor"(%38245) : (tensor<4x?x14336xf32>) -> !torch.vtensor<[4,?,14336],f32>
    "torch.bind_symbolic_shape"(%38246, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> ()
    %38247 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %38248 = "torch.prims.convert_element_type"(%38246, %38247) : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%38248, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %38249 = "torch.aten.silu"(%38248) : (!torch.vtensor<[4,?,14336],bf16>) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%38249, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %38250 = "torch.aten.div.Tensor"(%38228, %18381) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%38250, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %38251 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %38252 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %38253 = "torch.aten.clamp"(%38250, %38251, %38252) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%38253, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %38254 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %38255 = "torch.prims.convert_element_type"(%38253, %38254) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%38255, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %38256 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %38257 = "torch.aten.unsqueeze"(%18383, %38256) : (!torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>
    %38258 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %38259 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %38260 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %38261 = "torch.prim.ListConstruct"(%38258, %38259, %38260) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %38262 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %38263 = "torch.aten.expand"(%38257, %38261, %38262) : (!torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ>
    %38264 = "torch_c.to_builtin_tensor"(%38255) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %38265 = "torch_c.to_builtin_tensor"(%38263) : (!torch.vtensor<[4,14336,4096],f8E4M3FNUZ>) -> tensor<4x14336x4096xf8E4M3FNUZ>
    %38266 = "util.call"(%38264, %38265) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x?x14336xf32>
    %38267 = "torch_c.from_builtin_tensor"(%38266) : (tensor<4x?x14336xf32>) -> !torch.vtensor<[4,?,14336],f32>
    "torch.bind_symbolic_shape"(%38267, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> ()
    %38268 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %38269 = "torch.prims.convert_element_type"(%38267, %38268) : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%38269, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %38270 = "torch.aten.mul.Tensor"(%38249, %38269) : (!torch.vtensor<[4,?,14336],bf16>, !torch.vtensor<[4,?,14336],bf16>) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%38270, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %38271 = "torch.aten.div.Tensor"(%38270, %18385) : (!torch.vtensor<[4,?,14336],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%38271, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %38272 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %38273 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %38274 = "torch.aten.clamp"(%38271, %38272, %38273) : (!torch.vtensor<[4,?,14336],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%38274, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %38275 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %38276 = "torch.prims.convert_element_type"(%38274, %38275) : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> !torch.vtensor<[4,?,14336],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%38276, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],f8E4M3FNUZ>, !torch.int) -> ()
    %38277 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %38278 = "torch.aten.unsqueeze"(%18387, %38277) : (!torch.vtensor<[4096,14336],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,14336],f8E4M3FNUZ>
    %38279 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %38280 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %38281 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %38282 = "torch.prim.ListConstruct"(%38279, %38280, %38281) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %38283 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %38284 = "torch.aten.expand"(%38278, %38282, %38283) : (!torch.vtensor<[1,4096,14336],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,14336],f8E4M3FNUZ>
    %38285 = "torch_c.to_builtin_tensor"(%38276) : (!torch.vtensor<[4,?,14336],f8E4M3FNUZ>) -> tensor<4x?x14336xf8E4M3FNUZ>
    %38286 = "torch_c.to_builtin_tensor"(%38284) : (!torch.vtensor<[4,4096,14336],f8E4M3FNUZ>) -> tensor<4x4096x14336xf8E4M3FNUZ>
    %38287 = "util.call"(%38285, %38286) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx14336xf8E4M3FNUZ_R4x4096x14336xf8E4M3FNUZ}> : (tensor<4x?x14336xf8E4M3FNUZ>, tensor<4x4096x14336xf8E4M3FNUZ>) -> tensor<4x?x4096xf32>
    %38288 = "torch_c.from_builtin_tensor"(%38287) : (tensor<4x?x4096xf32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%38288, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %38289 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %38290 = "torch.prims.convert_element_type"(%38288, %38289) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%38290, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %38291 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %38292 = "torch.aten.add.Tensor"(%38209, %38290, %38291) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%38292, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %38293 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %38294 = "torch.prims.convert_element_type"(%38292, %38293) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%38294, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %38295 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %38296 = "torch.aten.pow.Tensor_Scalar"(%38294, %38295) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%38296, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %38297 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %38298 = "torch.prim.ListConstruct"(%38297) : (!torch.int) -> !torch.list<int>
    %38299 = "torch.constant.bool"() <{value = true}> : () -> !torch.bool
    %38300 = "torch.constant.none"() : () -> !torch.none
    %38301 = "torch.aten.mean.dim"(%38296, %38298, %38299, %38300) : (!torch.vtensor<[4,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%38301, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %38302 = "torch.constant.float"() <{value = 1.000000e-05 : f64}> : () -> !torch.float
    %38303 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %38304 = "torch.aten.add.Scalar"(%38301, %38302, %38303) : (!torch.vtensor<[4,?,1],f32>, !torch.float, !torch.int) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%38304, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %38305 = "torch.aten.rsqrt"(%38304) : (!torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%38305, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %38306 = "torch.aten.mul.Tensor"(%38294, %38305) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%38306, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %38307 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %38308 = "torch.prims.convert_element_type"(%38306, %38307) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%38308, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %38309 = "torch.aten.mul.Tensor"(%18389, %38308) : (!torch.vtensor<[4096],bf16>, !torch.vtensor<[4,?,4096],bf16>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%38309, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %38310 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %38311 = "torch.prims.convert_element_type"(%38309, %38310) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%38311, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %38312 = "torch.aten.div.Tensor"(%38311, %18391) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%38312, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %38313 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %38314 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %38315 = "torch.aten.clamp"(%38312, %38313, %38314) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%38315, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %38316 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %38317 = "torch.prims.convert_element_type"(%38315, %38316) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%38317, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %38318 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %38319 = "torch.aten.unsqueeze"(%18393, %38318) : (!torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>
    %38320 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %38321 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %38322 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %38323 = "torch.prim.ListConstruct"(%38320, %38321, %38322) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %38324 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %38325 = "torch.aten.expand"(%38319, %38323, %38324) : (!torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ>
    %38326 = "torch_c.to_builtin_tensor"(%38317) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %38327 = "torch_c.to_builtin_tensor"(%38325) : (!torch.vtensor<[4,4096,4096],f8E4M3FNUZ>) -> tensor<4x4096x4096xf8E4M3FNUZ>
    %38328 = "util.call"(%38326, %38327) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x?x4096xf32>
    %38329 = "torch_c.from_builtin_tensor"(%38328) : (tensor<4x?x4096xf32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%38329, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %38330 = "torch.aten.div.Tensor"(%38329, %18395) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%38330, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %38331 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %38332 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %38333 = "torch.aten.clamp"(%38330, %38331, %38332) : (!torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%38333, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %38334 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %38335 = "torch.prims.convert_element_type"(%38333, %38334) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%38335, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %38336 = "torch.aten.div.Tensor"(%38311, %18397) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%38336, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %38337 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %38338 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %38339 = "torch.aten.clamp"(%38336, %38337, %38338) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%38339, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %38340 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %38341 = "torch.prims.convert_element_type"(%38339, %38340) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%38341, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %38342 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %38343 = "torch.aten.unsqueeze"(%18399, %38342) : (!torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>
    %38344 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %38345 = "torch.constant.int"() <{value = 1024 : i64}> : () -> !torch.int
    %38346 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %38347 = "torch.prim.ListConstruct"(%38344, %38345, %38346) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %38348 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %38349 = "torch.aten.expand"(%38343, %38347, %38348) : (!torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ>
    %38350 = "torch_c.to_builtin_tensor"(%38341) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %38351 = "torch_c.to_builtin_tensor"(%38349) : (!torch.vtensor<[4,1024,4096],f8E4M3FNUZ>) -> tensor<4x1024x4096xf8E4M3FNUZ>
    %38352 = "util.call"(%38350, %38351) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x?x1024xf32>
    %38353 = "torch_c.from_builtin_tensor"(%38352) : (tensor<4x?x1024xf32>) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%38353, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %38354 = "torch.aten.div.Tensor"(%38353, %18401) : (!torch.vtensor<[4,?,1024],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%38354, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %38355 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %38356 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %38357 = "torch.aten.clamp"(%38354, %38355, %38356) : (!torch.vtensor<[4,?,1024],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%38357, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %38358 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %38359 = "torch.prims.convert_element_type"(%38357, %38358) : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> !torch.vtensor<[4,?,1024],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%38359, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.int) -> ()
    %38360 = "torch.aten.div.Tensor"(%38311, %18403) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%38360, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %38361 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %38362 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %38363 = "torch.aten.clamp"(%38360, %38361, %38362) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%38363, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %38364 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %38365 = "torch.prims.convert_element_type"(%38363, %38364) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%38365, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %38366 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %38367 = "torch.aten.unsqueeze"(%18405, %38366) : (!torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>
    %38368 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %38369 = "torch.constant.int"() <{value = 1024 : i64}> : () -> !torch.int
    %38370 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %38371 = "torch.prim.ListConstruct"(%38368, %38369, %38370) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %38372 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %38373 = "torch.aten.expand"(%38367, %38371, %38372) : (!torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ>
    %38374 = "torch_c.to_builtin_tensor"(%38365) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %38375 = "torch_c.to_builtin_tensor"(%38373) : (!torch.vtensor<[4,1024,4096],f8E4M3FNUZ>) -> tensor<4x1024x4096xf8E4M3FNUZ>
    %38376 = "util.call"(%38374, %38375) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x?x1024xf32>
    %38377 = "torch_c.from_builtin_tensor"(%38376) : (tensor<4x?x1024xf32>) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%38377, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %38378 = "torch.aten.div.Tensor"(%38377, %18407) : (!torch.vtensor<[4,?,1024],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%38378, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %38379 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %38380 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %38381 = "torch.aten.clamp"(%38378, %38379, %38380) : (!torch.vtensor<[4,?,1024],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%38381, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %38382 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %38383 = "torch.prims.convert_element_type"(%38381, %38382) : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> !torch.vtensor<[4,?,1024],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%38383, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.int) -> ()
    %38384 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %38385 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %38386 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %38387 = "torch.prim.ListConstruct"(%38384, %18481, %38385, %38386) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %38388 = "torch.aten.view"(%38335, %38387) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%38388, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %38389 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %38390 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %38391 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %38392 = "torch.prim.ListConstruct"(%38389, %18481, %38390, %38391) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %38393 = "torch.aten.view"(%38359, %38392) : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%38393, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %38394 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %38395 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %38396 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %38397 = "torch.prim.ListConstruct"(%38394, %18481, %38395, %38396) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %38398 = "torch.aten.view"(%38383, %38397) : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%38398, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %38399 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
    %38400 = "torch.constant.none"() : () -> !torch.none
    %38401 = "torch.constant.none"() : () -> !torch.none
    %38402 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %38403 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %38404 = "torch.aten.arange"(%38399, %38400, %38401, %38402, %38403) : (!torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[131072],si64>
    %38405 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %38406 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %38407 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %38408 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %38409 = "torch.constant.none"() : () -> !torch.none
    %38410 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %38411 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %38412 = "torch.aten.arange.start_step"(%38405, %38406, %38407, %38408, %38409, %38410, %38411) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[64],si64>
    %38413 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %38414 = "torch.prims.convert_element_type"(%38412, %38413) : (!torch.vtensor<[64],si64>, !torch.int) -> !torch.vtensor<[64],f32>
    %38415 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %38416 = "torch.aten.div.Scalar"(%38414, %38415) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %38417 = "torch.constant.float"() <{value = 5.000000e+05 : f64}> : () -> !torch.float
    %38418 = "torch.aten.pow.Scalar"(%38417, %38416) : (!torch.float, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %38419 = "torch.aten.reciprocal"(%38418) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %38420 = "torch.constant.float"() <{value = 1.000000e+00 : f64}> : () -> !torch.float
    %38421 = "torch.aten.mul.Scalar"(%38419, %38420) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
    %38422 = "torch.aten.reciprocal"(%38421) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %38423 = "torch.constant.float"() <{value = 6.2831853071795862 : f64}> : () -> !torch.float
    %38424 = "torch.aten.mul.Scalar"(%38422, %38423) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
    %38425 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
    %38426 = "torch.aten.gt.Scalar"(%38424, %38425) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %38427 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %38428 = "torch.aten.div.Scalar"(%38421, %38427) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %38429 = "torch.aten.where.self"(%38426, %38428, %38421) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %38430 = "torch.aten.reciprocal"(%38424) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %38431 = "torch.constant.int"() <{value = 8192 : i64}> : () -> !torch.int
    %38432 = "torch.aten.mul.Scalar"(%38430, %38431) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %38433 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %38434 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %38435 = "torch.aten.sub.Scalar"(%38432, %38433, %38434) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
    %38436 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %38437 = "torch.aten.div.Scalar"(%38435, %38436) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %38438 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %38439 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %38440 = "torch.aten.rsub.Scalar"(%38437, %38438, %38439) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
    %38441 = "torch.aten.mul.Tensor"(%38440, %38429) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %38442 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %38443 = "torch.aten.div.Scalar"(%38441, %38442) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %38444 = "torch.aten.mul.Tensor"(%38437, %38429) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %38445 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %38446 = "torch.aten.add.Tensor"(%38443, %38444, %38445) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %38447 = "torch.constant.float"() <{value = 2.048000e+03 : f64}> : () -> !torch.float
    %38448 = "torch.aten.lt.Scalar"(%38424, %38447) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %38449 = "torch.aten.bitwise_not"(%38448) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %38450 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
    %38451 = "torch.aten.gt.Scalar"(%38424, %38450) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %38452 = "torch.aten.bitwise_not"(%38451) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %38453 = "torch.aten.mul.Tensor"(%38449, %38452) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %38454 = "torch.aten.where.self"(%38453, %38446, %38429) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %38455 = "torch.prim.ListConstruct"(%38454, %38454) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.list<vtensor>
    %38456 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %38457 = "torch.aten.cat"(%38455, %38456) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[128],f32>
    %38458 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %38459 = "torch.prims.convert_element_type"(%38404, %38458) : (!torch.vtensor<[131072],si64>, !torch.int) -> !torch.vtensor<[131072],f32>
    %38460 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %38461 = "torch.prims.convert_element_type"(%38457, %38460) : (!torch.vtensor<[128],f32>, !torch.int) -> !torch.vtensor<[128],f32>
    %38462 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
    %38463 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %38464 = "torch.prim.ListConstruct"(%38462, %38463) : (!torch.int, !torch.int) -> !torch.list<int>
    %38465 = "torch.aten.view"(%38459, %38464) : (!torch.vtensor<[131072],f32>, !torch.list<int>) -> !torch.vtensor<[131072,1],f32>
    %38466 = "torch.aten.mul.Tensor"(%38465, %38461) : (!torch.vtensor<[131072,1],f32>, !torch.vtensor<[128],f32>) -> !torch.vtensor<[131072,128],f32>
    %38467 = "torch.aten.cos"(%38466) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
    %38468 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %38469 = "torch.prims.convert_element_type"(%38467, %38468) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
    %38470 = "torch.aten.sin"(%38466) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
    %38471 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %38472 = "torch.prims.convert_element_type"(%38470, %38471) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
    %38473 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %38474 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %38475 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %38476 = "torch.aten.slice.Tensor"(%38469, %38473, %38474, %18481, %38475) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%38476, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %38477 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %38478 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %38479 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %38480 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %38481 = "torch.aten.slice.Tensor"(%38476, %38477, %38478, %38479, %38480) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%38481, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %38482 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %38483 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %38484 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %38485 = "torch.aten.slice.Tensor"(%38472, %38482, %38483, %18481, %38484) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%38485, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %38486 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %38487 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %38488 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %38489 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %38490 = "torch.aten.slice.Tensor"(%38485, %38486, %38487, %38488, %38489) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%38490, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %38491 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %38492 = "torch.aten.unsqueeze"(%38481, %38491) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%38492, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %38493 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %38494 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %38495 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %38496 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %38497 = "torch.aten.slice.Tensor"(%38492, %38493, %38494, %38495, %38496) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%38497, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %38498 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %38499 = "torch.aten.unsqueeze"(%38497, %38498) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%38499, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %38500 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %38501 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %38502 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %38503 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %38504 = "torch.aten.slice.Tensor"(%38499, %38500, %38501, %38502, %38503) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%38504, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %38505 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %38506 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %38507 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %38508 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %38509 = "torch.prim.ListConstruct"(%38505, %38506, %38507, %38508) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %38510 = "torch.aten.repeat"(%38504, %38509) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%38510, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
    %38511 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %38512 = "torch.aten.unsqueeze"(%38490, %38511) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%38512, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %38513 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %38514 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %38515 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %38516 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %38517 = "torch.aten.slice.Tensor"(%38512, %38513, %38514, %38515, %38516) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%38517, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %38518 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %38519 = "torch.aten.unsqueeze"(%38517, %38518) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%38519, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %38520 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %38521 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %38522 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %38523 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %38524 = "torch.aten.slice.Tensor"(%38519, %38520, %38521, %38522, %38523) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%38524, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %38525 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %38526 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %38527 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %38528 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %38529 = "torch.prim.ListConstruct"(%38525, %38526, %38527, %38528) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %38530 = "torch.aten.repeat"(%38524, %38529) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%38530, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
    %38531 = "torch.aten.mul.Tensor"(%38388, %38510) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%38531, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %38532 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %38533 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %38534 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %38535 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %38536 = "torch.aten.slice.Tensor"(%38388, %38532, %38533, %38534, %38535) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%38536, %18474) <{shape_expressions = #map18}> : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.int) -> ()
    %38537 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %38538 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %38539 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %38540 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %38541 = "torch.aten.slice.Tensor"(%38388, %38537, %38538, %38539, %38540) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%38541, %18474) <{shape_expressions = #map18}> : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.int) -> ()
    %38542 = "torch.aten.neg"(%38541) : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>) -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%38542, %18474) <{shape_expressions = #map18}> : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.int) -> ()
    %38543 = "torch.prim.ListConstruct"(%38542, %38536) : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>) -> !torch.list<vtensor>
    %38544 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %38545 = "torch.aten.cat"(%38543, %38544) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%38545, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %38546 = "torch.aten.mul.Tensor"(%38545, %38530) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%38546, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %38547 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %38548 = "torch.aten.add.Tensor"(%38531, %38546, %38547) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%38548, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %38549 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
    %38550 = "torch.constant.none"() : () -> !torch.none
    %38551 = "torch.constant.none"() : () -> !torch.none
    %38552 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %38553 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %38554 = "torch.aten.arange"(%38549, %38550, %38551, %38552, %38553) : (!torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[131072],si64>
    %38555 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %38556 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %38557 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %38558 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %38559 = "torch.constant.none"() : () -> !torch.none
    %38560 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %38561 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %38562 = "torch.aten.arange.start_step"(%38555, %38556, %38557, %38558, %38559, %38560, %38561) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[64],si64>
    %38563 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %38564 = "torch.prims.convert_element_type"(%38562, %38563) : (!torch.vtensor<[64],si64>, !torch.int) -> !torch.vtensor<[64],f32>
    %38565 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %38566 = "torch.aten.div.Scalar"(%38564, %38565) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %38567 = "torch.constant.float"() <{value = 5.000000e+05 : f64}> : () -> !torch.float
    %38568 = "torch.aten.pow.Scalar"(%38567, %38566) : (!torch.float, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %38569 = "torch.aten.reciprocal"(%38568) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %38570 = "torch.constant.float"() <{value = 1.000000e+00 : f64}> : () -> !torch.float
    %38571 = "torch.aten.mul.Scalar"(%38569, %38570) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
    %38572 = "torch.aten.reciprocal"(%38571) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %38573 = "torch.constant.float"() <{value = 6.2831853071795862 : f64}> : () -> !torch.float
    %38574 = "torch.aten.mul.Scalar"(%38572, %38573) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
    %38575 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
    %38576 = "torch.aten.gt.Scalar"(%38574, %38575) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %38577 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %38578 = "torch.aten.div.Scalar"(%38571, %38577) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %38579 = "torch.aten.where.self"(%38576, %38578, %38571) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %38580 = "torch.aten.reciprocal"(%38574) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %38581 = "torch.constant.int"() <{value = 8192 : i64}> : () -> !torch.int
    %38582 = "torch.aten.mul.Scalar"(%38580, %38581) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %38583 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %38584 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %38585 = "torch.aten.sub.Scalar"(%38582, %38583, %38584) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
    %38586 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %38587 = "torch.aten.div.Scalar"(%38585, %38586) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %38588 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %38589 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %38590 = "torch.aten.rsub.Scalar"(%38587, %38588, %38589) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
    %38591 = "torch.aten.mul.Tensor"(%38590, %38579) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %38592 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %38593 = "torch.aten.div.Scalar"(%38591, %38592) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %38594 = "torch.aten.mul.Tensor"(%38587, %38579) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %38595 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %38596 = "torch.aten.add.Tensor"(%38593, %38594, %38595) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %38597 = "torch.constant.float"() <{value = 2.048000e+03 : f64}> : () -> !torch.float
    %38598 = "torch.aten.lt.Scalar"(%38574, %38597) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %38599 = "torch.aten.bitwise_not"(%38598) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %38600 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
    %38601 = "torch.aten.gt.Scalar"(%38574, %38600) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %38602 = "torch.aten.bitwise_not"(%38601) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %38603 = "torch.aten.mul.Tensor"(%38599, %38602) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %38604 = "torch.aten.where.self"(%38603, %38596, %38579) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %38605 = "torch.prim.ListConstruct"(%38604, %38604) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.list<vtensor>
    %38606 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %38607 = "torch.aten.cat"(%38605, %38606) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[128],f32>
    %38608 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %38609 = "torch.prims.convert_element_type"(%38554, %38608) : (!torch.vtensor<[131072],si64>, !torch.int) -> !torch.vtensor<[131072],f32>
    %38610 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %38611 = "torch.prims.convert_element_type"(%38607, %38610) : (!torch.vtensor<[128],f32>, !torch.int) -> !torch.vtensor<[128],f32>
    %38612 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
    %38613 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %38614 = "torch.prim.ListConstruct"(%38612, %38613) : (!torch.int, !torch.int) -> !torch.list<int>
    %38615 = "torch.aten.view"(%38609, %38614) : (!torch.vtensor<[131072],f32>, !torch.list<int>) -> !torch.vtensor<[131072,1],f32>
    %38616 = "torch.aten.mul.Tensor"(%38615, %38611) : (!torch.vtensor<[131072,1],f32>, !torch.vtensor<[128],f32>) -> !torch.vtensor<[131072,128],f32>
    %38617 = "torch.aten.cos"(%38616) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
    %38618 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %38619 = "torch.prims.convert_element_type"(%38617, %38618) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
    %38620 = "torch.aten.sin"(%38616) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
    %38621 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %38622 = "torch.prims.convert_element_type"(%38620, %38621) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
    %38623 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %38624 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %38625 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %38626 = "torch.aten.slice.Tensor"(%38619, %38623, %38624, %18481, %38625) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%38626, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %38627 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %38628 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %38629 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %38630 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %38631 = "torch.aten.slice.Tensor"(%38626, %38627, %38628, %38629, %38630) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%38631, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %38632 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %38633 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %38634 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %38635 = "torch.aten.slice.Tensor"(%38622, %38632, %38633, %18481, %38634) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%38635, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %38636 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %38637 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %38638 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %38639 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %38640 = "torch.aten.slice.Tensor"(%38635, %38636, %38637, %38638, %38639) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%38640, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %38641 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %38642 = "torch.aten.unsqueeze"(%38631, %38641) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%38642, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %38643 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %38644 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %38645 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %38646 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %38647 = "torch.aten.slice.Tensor"(%38642, %38643, %38644, %38645, %38646) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%38647, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %38648 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %38649 = "torch.aten.unsqueeze"(%38647, %38648) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%38649, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %38650 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %38651 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %38652 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %38653 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %38654 = "torch.aten.slice.Tensor"(%38649, %38650, %38651, %38652, %38653) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%38654, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %38655 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %38656 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %38657 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %38658 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %38659 = "torch.prim.ListConstruct"(%38655, %38656, %38657, %38658) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %38660 = "torch.aten.repeat"(%38654, %38659) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%38660, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
    %38661 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %38662 = "torch.aten.unsqueeze"(%38640, %38661) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%38662, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %38663 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %38664 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %38665 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %38666 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %38667 = "torch.aten.slice.Tensor"(%38662, %38663, %38664, %38665, %38666) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%38667, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %38668 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %38669 = "torch.aten.unsqueeze"(%38667, %38668) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%38669, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %38670 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %38671 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %38672 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %38673 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %38674 = "torch.aten.slice.Tensor"(%38669, %38670, %38671, %38672, %38673) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%38674, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %38675 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %38676 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %38677 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %38678 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %38679 = "torch.prim.ListConstruct"(%38675, %38676, %38677, %38678) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %38680 = "torch.aten.repeat"(%38674, %38679) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%38680, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
    %38681 = "torch.aten.mul.Tensor"(%38393, %38660) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%38681, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %38682 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %38683 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %38684 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %38685 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %38686 = "torch.aten.slice.Tensor"(%38393, %38682, %38683, %38684, %38685) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%38686, %18474) <{shape_expressions = #map19}> : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.int) -> ()
    %38687 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %38688 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %38689 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %38690 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %38691 = "torch.aten.slice.Tensor"(%38393, %38687, %38688, %38689, %38690) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%38691, %18474) <{shape_expressions = #map19}> : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.int) -> ()
    %38692 = "torch.aten.neg"(%38691) : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>) -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%38692, %18474) <{shape_expressions = #map19}> : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.int) -> ()
    %38693 = "torch.prim.ListConstruct"(%38692, %38686) : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>) -> !torch.list<vtensor>
    %38694 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %38695 = "torch.aten.cat"(%38693, %38694) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%38695, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %38696 = "torch.aten.mul.Tensor"(%38695, %38680) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%38696, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %38697 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %38698 = "torch.aten.add.Tensor"(%38681, %38696, %38697) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%38698, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %38699 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %38700 = "torch.aten.mul.Scalar"(%arg69, %38699) : (!torch.vtensor<[4,?],si64>, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%38700, %18474) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %38701 = "torch.constant.int"() <{value = 60 : i64}> : () -> !torch.int
    %38702 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %38703 = "torch.aten.add.Scalar"(%38700, %38701, %38702) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%38703, %18474) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %38704 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %38705 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %38706 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %38707 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %38708 = "torch.prim.ListConstruct"(%38704, %18477, %38705, %38706, %38707) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %38709 = "torch.aten.view"(%38698, %38708) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%38709, %18474) <{shape_expressions = #map22}> : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %38710 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %38711 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %38712 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %38713 = "torch.prim.ListConstruct"(%19011, %38710, %38711, %38712) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %38714 = "torch.aten.view"(%38709, %38713) : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%38714, %18474) <{shape_expressions = #map23}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %38715 = "torch.prim.ListConstruct"(%19011) : (!torch.int) -> !torch.list<int>
    %38716 = "torch.aten.view"(%38703, %38715) : (!torch.vtensor<[4,?],si64>, !torch.list<int>) -> !torch.vtensor<[?],si64>
    "torch.bind_symbolic_shape"(%38716, %18474) <{shape_expressions = #map24}> : (!torch.vtensor<[?],si64>, !torch.int) -> ()
    %38717 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %38718 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %38719 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %38720 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %38721 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %38722 = "torch.prim.ListConstruct"(%18479, %38717, %38718, %38719, %38720, %38721) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %38723 = "torch.aten.view"(%38125, %38722) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%38723, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %38724 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %38725 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %38726 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %38727 = "torch.prim.ListConstruct"(%18993, %38724, %38725, %38726) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %38728 = "torch.aten.view"(%38723, %38727) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%38728, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %38729 = "torch.prim.ListConstruct"(%38716) : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>>
    %38730 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %38731 = "torch.aten.index_put"(%38728, %38729, %38714, %38730) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.bool) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%38731, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %38732 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %38733 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %38734 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %38735 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %38736 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %38737 = "torch.prim.ListConstruct"(%18479, %38732, %38733, %38734, %38735, %38736) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %38738 = "torch.aten.view"(%38731, %38737) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%38738, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %38739 = "torch.constant.int"() <{value = 2097152 : i64}> : () -> !torch.int
    %38740 = "torch.prim.ListConstruct"(%18479, %38739) : (!torch.int, !torch.int) -> !torch.list<int>
    %38741 = "torch.aten.view"(%38738, %38740) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,2097152],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%38741, %18475) <{shape_expressions = #map2}> : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> ()
    %38742 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %38743 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %38744 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %38745 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %38746 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %38747 = "torch.prim.ListConstruct"(%18479, %38742, %38743, %38744, %38745, %38746) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %38748 = "torch.aten.view"(%38741, %38747) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%38748, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %38749 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %38750 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %38751 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %38752 = "torch.prim.ListConstruct"(%18993, %38749, %38750, %38751) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %38753 = "torch.aten.view"(%38748, %38752) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%38753, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %38754 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %38755 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %38756 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %38757 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %38758 = "torch.prim.ListConstruct"(%38754, %18477, %38755, %38756, %38757) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %38759 = "torch.aten.view"(%38398, %38758) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%38759, %18474) <{shape_expressions = #map22}> : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %38760 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %38761 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %38762 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %38763 = "torch.prim.ListConstruct"(%19011, %38760, %38761, %38762) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %38764 = "torch.aten.view"(%38759, %38763) : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%38764, %18474) <{shape_expressions = #map23}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %38765 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %38766 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %38767 = "torch.aten.add.Scalar"(%38703, %38765, %38766) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%38767, %18474) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %38768 = "torch.prim.ListConstruct"(%19011) : (!torch.int) -> !torch.list<int>
    %38769 = "torch.aten.view"(%38767, %38768) : (!torch.vtensor<[4,?],si64>, !torch.list<int>) -> !torch.vtensor<[?],si64>
    "torch.bind_symbolic_shape"(%38769, %18474) <{shape_expressions = #map24}> : (!torch.vtensor<[?],si64>, !torch.int) -> ()
    %38770 = "torch.prim.ListConstruct"(%38769) : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>>
    %38771 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %38772 = "torch.aten.index_put"(%38753, %38770, %38764, %38771) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.bool) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%38772, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %38773 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %38774 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %38775 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %38776 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %38777 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %38778 = "torch.prim.ListConstruct"(%18479, %38773, %38774, %38775, %38776, %38777) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %38779 = "torch.aten.view"(%38772, %38778) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%38779, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %38780 = "torch.constant.int"() <{value = 2097152 : i64}> : () -> !torch.int
    %38781 = "torch.prim.ListConstruct"(%18479, %38780) : (!torch.int, !torch.int) -> !torch.list<int>
    %38782 = "torch.aten.view"(%38779, %38781) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,2097152],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%38782, %18475) <{shape_expressions = #map2}> : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> ()
    %38783 = "torch.constant.int"() <{value = -2 : i64}> : () -> !torch.int
    %38784 = "torch.aten.unsqueeze"(%38698, %38783) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%38784, %18474) <{shape_expressions = #map25}> : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.int) -> ()
    %38785 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %38786 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %38787 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %38788 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %38789 = "torch.prim.ListConstruct"(%38785, %18481, %38786, %38787, %38788) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %38790 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %38791 = "torch.aten.expand"(%38784, %38789, %38790) : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%38791, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %38792 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %38793 = "torch.aten.clone"(%38791, %38792) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%38793, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %38794 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %38795 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %38796 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %38797 = "torch.prim.ListConstruct"(%38794, %18481, %38795, %38796) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %38798 = "torch.aten._unsafe_view"(%38793, %38797) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%38798, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %38799 = "torch.constant.int"() <{value = -2 : i64}> : () -> !torch.int
    %38800 = "torch.aten.unsqueeze"(%38398, %38799) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%38800, %18474) <{shape_expressions = #map25}> : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.int) -> ()
    %38801 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %38802 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %38803 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %38804 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %38805 = "torch.prim.ListConstruct"(%38801, %18481, %38802, %38803, %38804) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %38806 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %38807 = "torch.aten.expand"(%38800, %38805, %38806) : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%38807, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %38808 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %38809 = "torch.aten.clone"(%38807, %38808) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%38809, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %38810 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %38811 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %38812 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %38813 = "torch.prim.ListConstruct"(%38810, %18481, %38811, %38812) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %38814 = "torch.aten._unsafe_view"(%38809, %38813) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%38814, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %38815 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %38816 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %38817 = "torch.aten.transpose.int"(%38548, %38815, %38816) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%38817, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %38818 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %38819 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %38820 = "torch.aten.transpose.int"(%38798, %38818, %38819) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%38820, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %38821 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %38822 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %38823 = "torch.aten.transpose.int"(%38814, %38821, %38822) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%38823, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %38824 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %38825 = "torch.aten.squeeze.dim"(%18570, %38824) : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,1,?,?],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%38825, %18474) <{shape_expressions = #map8}> : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> ()
    %38826 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %38827 = "torch.aten.squeeze.dim"(%38825, %38826) : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,1,?,?],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%38827, %18474) <{shape_expressions = #map8}> : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> ()
    %38828 = "torch_c.to_builtin_tensor"(%38817) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %38829 = "torch_c.to_builtin_tensor"(%38820) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %38830 = "torch_c.to_builtin_tensor"(%38823) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %38831 = "torch_c.to_builtin_tensor"(%38827) : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>) -> tensor<4x1x?x?xf8E4M3FNUZ>
    %38832 = "tensor.cast"(%38831) : (tensor<4x1x?x?xf8E4M3FNUZ>) -> tensor<?x?x?x?xf8E4M3FNUZ>
    %38833 = "torch_c.to_builtin_tensor"(%18409) : (!torch.vtensor<[],f32>) -> tensor<f32>
    %38834 = "util.call"(%38828, %38829, %38830, %38833, %38832) <{callee = @sharktank_masked_flash_attention_4_32_128_128_f8E4M3FNUZ_f32_f32}> : (tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<f32>, tensor<?x?x?x?xf8E4M3FNUZ>) -> tensor<4x32x?x128xf32>
    %38835 = "torch_c.from_builtin_tensor"(%38834) : (tensor<4x32x?x128xf32>) -> !torch.vtensor<[4,32,?,128],f32>
    "torch.bind_symbolic_shape"(%38835, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f32>, !torch.int) -> ()
    %38836 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %38837 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %38838 = "torch.aten.transpose.int"(%38835, %38836, %38837) : (!torch.vtensor<[4,32,?,128],f32>, !torch.int, !torch.int) -> !torch.vtensor<[4,?,32,128],f32>
    "torch.bind_symbolic_shape"(%38838, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f32>, !torch.int) -> ()
    %38839 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %38840 = "torch.aten.clone"(%38838, %38839) : (!torch.vtensor<[4,?,32,128],f32>, !torch.int) -> !torch.vtensor<[4,?,32,128],f32>
    "torch.bind_symbolic_shape"(%38840, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f32>, !torch.int) -> ()
    %38841 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %38842 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %38843 = "torch.prim.ListConstruct"(%38841, %18481, %38842) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %38844 = "torch.aten._unsafe_view"(%38840, %38843) : (!torch.vtensor<[4,?,32,128],f32>, !torch.list<int>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%38844, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %38845 = "torch.aten.div.Tensor"(%38844, %18411) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%38845, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %38846 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %38847 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %38848 = "torch.aten.clamp"(%38845, %38846, %38847) : (!torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%38848, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %38849 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %38850 = "torch.prims.convert_element_type"(%38848, %38849) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%38850, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %38851 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %38852 = "torch.aten.unsqueeze"(%18413, %38851) : (!torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>
    %38853 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %38854 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %38855 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %38856 = "torch.prim.ListConstruct"(%38853, %38854, %38855) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %38857 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %38858 = "torch.aten.expand"(%38852, %38856, %38857) : (!torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ>
    %38859 = "torch_c.to_builtin_tensor"(%38850) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %38860 = "torch_c.to_builtin_tensor"(%38858) : (!torch.vtensor<[4,4096,4096],f8E4M3FNUZ>) -> tensor<4x4096x4096xf8E4M3FNUZ>
    %38861 = "util.call"(%38859, %38860) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x?x4096xf32>
    %38862 = "torch_c.from_builtin_tensor"(%38861) : (tensor<4x?x4096xf32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%38862, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %38863 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %38864 = "torch.prims.convert_element_type"(%38862, %38863) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%38864, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %38865 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %38866 = "torch.aten.add.Tensor"(%38292, %38864, %38865) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%38866, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %38867 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %38868 = "torch.prims.convert_element_type"(%38866, %38867) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%38868, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %38869 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %38870 = "torch.aten.pow.Tensor_Scalar"(%38868, %38869) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%38870, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %38871 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %38872 = "torch.prim.ListConstruct"(%38871) : (!torch.int) -> !torch.list<int>
    %38873 = "torch.constant.bool"() <{value = true}> : () -> !torch.bool
    %38874 = "torch.constant.none"() : () -> !torch.none
    %38875 = "torch.aten.mean.dim"(%38870, %38872, %38873, %38874) : (!torch.vtensor<[4,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%38875, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %38876 = "torch.constant.float"() <{value = 1.000000e-05 : f64}> : () -> !torch.float
    %38877 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %38878 = "torch.aten.add.Scalar"(%38875, %38876, %38877) : (!torch.vtensor<[4,?,1],f32>, !torch.float, !torch.int) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%38878, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %38879 = "torch.aten.rsqrt"(%38878) : (!torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%38879, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %38880 = "torch.aten.mul.Tensor"(%38868, %38879) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%38880, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %38881 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %38882 = "torch.prims.convert_element_type"(%38880, %38881) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%38882, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %38883 = "torch.aten.mul.Tensor"(%18415, %38882) : (!torch.vtensor<[4096],bf16>, !torch.vtensor<[4,?,4096],bf16>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%38883, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %38884 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %38885 = "torch.prims.convert_element_type"(%38883, %38884) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%38885, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %38886 = "torch.aten.div.Tensor"(%38885, %18417) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%38886, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %38887 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %38888 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %38889 = "torch.aten.clamp"(%38886, %38887, %38888) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%38889, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %38890 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %38891 = "torch.prims.convert_element_type"(%38889, %38890) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%38891, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %38892 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %38893 = "torch.aten.unsqueeze"(%18419, %38892) : (!torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>
    %38894 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %38895 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %38896 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %38897 = "torch.prim.ListConstruct"(%38894, %38895, %38896) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %38898 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %38899 = "torch.aten.expand"(%38893, %38897, %38898) : (!torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ>
    %38900 = "torch_c.to_builtin_tensor"(%38891) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %38901 = "torch_c.to_builtin_tensor"(%38899) : (!torch.vtensor<[4,14336,4096],f8E4M3FNUZ>) -> tensor<4x14336x4096xf8E4M3FNUZ>
    %38902 = "util.call"(%38900, %38901) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x?x14336xf32>
    %38903 = "torch_c.from_builtin_tensor"(%38902) : (tensor<4x?x14336xf32>) -> !torch.vtensor<[4,?,14336],f32>
    "torch.bind_symbolic_shape"(%38903, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> ()
    %38904 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %38905 = "torch.prims.convert_element_type"(%38903, %38904) : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%38905, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %38906 = "torch.aten.silu"(%38905) : (!torch.vtensor<[4,?,14336],bf16>) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%38906, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %38907 = "torch.aten.div.Tensor"(%38885, %18421) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%38907, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %38908 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %38909 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %38910 = "torch.aten.clamp"(%38907, %38908, %38909) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%38910, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %38911 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %38912 = "torch.prims.convert_element_type"(%38910, %38911) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%38912, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %38913 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %38914 = "torch.aten.unsqueeze"(%18423, %38913) : (!torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>
    %38915 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %38916 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %38917 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %38918 = "torch.prim.ListConstruct"(%38915, %38916, %38917) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %38919 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %38920 = "torch.aten.expand"(%38914, %38918, %38919) : (!torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ>
    %38921 = "torch_c.to_builtin_tensor"(%38912) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %38922 = "torch_c.to_builtin_tensor"(%38920) : (!torch.vtensor<[4,14336,4096],f8E4M3FNUZ>) -> tensor<4x14336x4096xf8E4M3FNUZ>
    %38923 = "util.call"(%38921, %38922) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x?x14336xf32>
    %38924 = "torch_c.from_builtin_tensor"(%38923) : (tensor<4x?x14336xf32>) -> !torch.vtensor<[4,?,14336],f32>
    "torch.bind_symbolic_shape"(%38924, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> ()
    %38925 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %38926 = "torch.prims.convert_element_type"(%38924, %38925) : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%38926, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %38927 = "torch.aten.mul.Tensor"(%38906, %38926) : (!torch.vtensor<[4,?,14336],bf16>, !torch.vtensor<[4,?,14336],bf16>) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%38927, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %38928 = "torch.aten.div.Tensor"(%38927, %18425) : (!torch.vtensor<[4,?,14336],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%38928, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %38929 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %38930 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %38931 = "torch.aten.clamp"(%38928, %38929, %38930) : (!torch.vtensor<[4,?,14336],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%38931, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %38932 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %38933 = "torch.prims.convert_element_type"(%38931, %38932) : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> !torch.vtensor<[4,?,14336],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%38933, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],f8E4M3FNUZ>, !torch.int) -> ()
    %38934 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %38935 = "torch.aten.unsqueeze"(%18427, %38934) : (!torch.vtensor<[4096,14336],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,14336],f8E4M3FNUZ>
    %38936 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %38937 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %38938 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %38939 = "torch.prim.ListConstruct"(%38936, %38937, %38938) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %38940 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %38941 = "torch.aten.expand"(%38935, %38939, %38940) : (!torch.vtensor<[1,4096,14336],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,14336],f8E4M3FNUZ>
    %38942 = "torch_c.to_builtin_tensor"(%38933) : (!torch.vtensor<[4,?,14336],f8E4M3FNUZ>) -> tensor<4x?x14336xf8E4M3FNUZ>
    %38943 = "torch_c.to_builtin_tensor"(%38941) : (!torch.vtensor<[4,4096,14336],f8E4M3FNUZ>) -> tensor<4x4096x14336xf8E4M3FNUZ>
    %38944 = "util.call"(%38942, %38943) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx14336xf8E4M3FNUZ_R4x4096x14336xf8E4M3FNUZ}> : (tensor<4x?x14336xf8E4M3FNUZ>, tensor<4x4096x14336xf8E4M3FNUZ>) -> tensor<4x?x4096xf32>
    %38945 = "torch_c.from_builtin_tensor"(%38944) : (tensor<4x?x4096xf32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%38945, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %38946 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %38947 = "torch.prims.convert_element_type"(%38945, %38946) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%38947, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %38948 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %38949 = "torch.aten.add.Tensor"(%38866, %38947, %38948) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%38949, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %38950 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %38951 = "torch.prims.convert_element_type"(%38949, %38950) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%38951, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %38952 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %38953 = "torch.aten.pow.Tensor_Scalar"(%38951, %38952) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%38953, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %38954 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %38955 = "torch.prim.ListConstruct"(%38954) : (!torch.int) -> !torch.list<int>
    %38956 = "torch.constant.bool"() <{value = true}> : () -> !torch.bool
    %38957 = "torch.constant.none"() : () -> !torch.none
    %38958 = "torch.aten.mean.dim"(%38953, %38955, %38956, %38957) : (!torch.vtensor<[4,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%38958, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %38959 = "torch.constant.float"() <{value = 1.000000e-05 : f64}> : () -> !torch.float
    %38960 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %38961 = "torch.aten.add.Scalar"(%38958, %38959, %38960) : (!torch.vtensor<[4,?,1],f32>, !torch.float, !torch.int) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%38961, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %38962 = "torch.aten.rsqrt"(%38961) : (!torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%38962, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %38963 = "torch.aten.mul.Tensor"(%38951, %38962) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%38963, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %38964 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %38965 = "torch.prims.convert_element_type"(%38963, %38964) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%38965, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %38966 = "torch.aten.mul.Tensor"(%18429, %38965) : (!torch.vtensor<[4096],bf16>, !torch.vtensor<[4,?,4096],bf16>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%38966, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %38967 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %38968 = "torch.prims.convert_element_type"(%38966, %38967) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%38968, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %38969 = "torch.aten.div.Tensor"(%38968, %18431) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%38969, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %38970 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %38971 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %38972 = "torch.aten.clamp"(%38969, %38970, %38971) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%38972, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %38973 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %38974 = "torch.prims.convert_element_type"(%38972, %38973) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%38974, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %38975 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %38976 = "torch.aten.unsqueeze"(%18433, %38975) : (!torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>
    %38977 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %38978 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %38979 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %38980 = "torch.prim.ListConstruct"(%38977, %38978, %38979) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %38981 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %38982 = "torch.aten.expand"(%38976, %38980, %38981) : (!torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ>
    %38983 = "torch_c.to_builtin_tensor"(%38974) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %38984 = "torch_c.to_builtin_tensor"(%38982) : (!torch.vtensor<[4,4096,4096],f8E4M3FNUZ>) -> tensor<4x4096x4096xf8E4M3FNUZ>
    %38985 = "util.call"(%38983, %38984) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x?x4096xf32>
    %38986 = "torch_c.from_builtin_tensor"(%38985) : (tensor<4x?x4096xf32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%38986, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %38987 = "torch.aten.div.Tensor"(%38986, %18435) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%38987, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %38988 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %38989 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %38990 = "torch.aten.clamp"(%38987, %38988, %38989) : (!torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%38990, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %38991 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %38992 = "torch.prims.convert_element_type"(%38990, %38991) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%38992, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %38993 = "torch.aten.div.Tensor"(%38968, %18437) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%38993, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %38994 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %38995 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %38996 = "torch.aten.clamp"(%38993, %38994, %38995) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%38996, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %38997 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %38998 = "torch.prims.convert_element_type"(%38996, %38997) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%38998, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %38999 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %39000 = "torch.aten.unsqueeze"(%18439, %38999) : (!torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>
    %39001 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %39002 = "torch.constant.int"() <{value = 1024 : i64}> : () -> !torch.int
    %39003 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %39004 = "torch.prim.ListConstruct"(%39001, %39002, %39003) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %39005 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %39006 = "torch.aten.expand"(%39000, %39004, %39005) : (!torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ>
    %39007 = "torch_c.to_builtin_tensor"(%38998) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %39008 = "torch_c.to_builtin_tensor"(%39006) : (!torch.vtensor<[4,1024,4096],f8E4M3FNUZ>) -> tensor<4x1024x4096xf8E4M3FNUZ>
    %39009 = "util.call"(%39007, %39008) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x?x1024xf32>
    %39010 = "torch_c.from_builtin_tensor"(%39009) : (tensor<4x?x1024xf32>) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%39010, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %39011 = "torch.aten.div.Tensor"(%39010, %18441) : (!torch.vtensor<[4,?,1024],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%39011, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %39012 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %39013 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %39014 = "torch.aten.clamp"(%39011, %39012, %39013) : (!torch.vtensor<[4,?,1024],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%39014, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %39015 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %39016 = "torch.prims.convert_element_type"(%39014, %39015) : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> !torch.vtensor<[4,?,1024],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%39016, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.int) -> ()
    %39017 = "torch.aten.div.Tensor"(%38968, %18443) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%39017, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %39018 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %39019 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %39020 = "torch.aten.clamp"(%39017, %39018, %39019) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%39020, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %39021 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %39022 = "torch.prims.convert_element_type"(%39020, %39021) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%39022, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %39023 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %39024 = "torch.aten.unsqueeze"(%18445, %39023) : (!torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>
    %39025 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %39026 = "torch.constant.int"() <{value = 1024 : i64}> : () -> !torch.int
    %39027 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %39028 = "torch.prim.ListConstruct"(%39025, %39026, %39027) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %39029 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %39030 = "torch.aten.expand"(%39024, %39028, %39029) : (!torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ>
    %39031 = "torch_c.to_builtin_tensor"(%39022) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %39032 = "torch_c.to_builtin_tensor"(%39030) : (!torch.vtensor<[4,1024,4096],f8E4M3FNUZ>) -> tensor<4x1024x4096xf8E4M3FNUZ>
    %39033 = "util.call"(%39031, %39032) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x?x1024xf32>
    %39034 = "torch_c.from_builtin_tensor"(%39033) : (tensor<4x?x1024xf32>) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%39034, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %39035 = "torch.aten.div.Tensor"(%39034, %18447) : (!torch.vtensor<[4,?,1024],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%39035, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %39036 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %39037 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %39038 = "torch.aten.clamp"(%39035, %39036, %39037) : (!torch.vtensor<[4,?,1024],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,1024],f32>
    "torch.bind_symbolic_shape"(%39038, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
    %39039 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %39040 = "torch.prims.convert_element_type"(%39038, %39039) : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> !torch.vtensor<[4,?,1024],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%39040, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.int) -> ()
    %39041 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %39042 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %39043 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %39044 = "torch.prim.ListConstruct"(%39041, %18481, %39042, %39043) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %39045 = "torch.aten.view"(%38992, %39044) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%39045, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %39046 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %39047 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %39048 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %39049 = "torch.prim.ListConstruct"(%39046, %18481, %39047, %39048) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %39050 = "torch.aten.view"(%39016, %39049) : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%39050, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %39051 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %39052 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %39053 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %39054 = "torch.prim.ListConstruct"(%39051, %18481, %39052, %39053) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %39055 = "torch.aten.view"(%39040, %39054) : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%39055, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %39056 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
    %39057 = "torch.constant.none"() : () -> !torch.none
    %39058 = "torch.constant.none"() : () -> !torch.none
    %39059 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %39060 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %39061 = "torch.aten.arange"(%39056, %39057, %39058, %39059, %39060) : (!torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[131072],si64>
    %39062 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %39063 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %39064 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %39065 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %39066 = "torch.constant.none"() : () -> !torch.none
    %39067 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %39068 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %39069 = "torch.aten.arange.start_step"(%39062, %39063, %39064, %39065, %39066, %39067, %39068) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[64],si64>
    %39070 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %39071 = "torch.prims.convert_element_type"(%39069, %39070) : (!torch.vtensor<[64],si64>, !torch.int) -> !torch.vtensor<[64],f32>
    %39072 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %39073 = "torch.aten.div.Scalar"(%39071, %39072) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %39074 = "torch.constant.float"() <{value = 5.000000e+05 : f64}> : () -> !torch.float
    %39075 = "torch.aten.pow.Scalar"(%39074, %39073) : (!torch.float, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %39076 = "torch.aten.reciprocal"(%39075) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %39077 = "torch.constant.float"() <{value = 1.000000e+00 : f64}> : () -> !torch.float
    %39078 = "torch.aten.mul.Scalar"(%39076, %39077) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
    %39079 = "torch.aten.reciprocal"(%39078) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %39080 = "torch.constant.float"() <{value = 6.2831853071795862 : f64}> : () -> !torch.float
    %39081 = "torch.aten.mul.Scalar"(%39079, %39080) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
    %39082 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
    %39083 = "torch.aten.gt.Scalar"(%39081, %39082) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %39084 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %39085 = "torch.aten.div.Scalar"(%39078, %39084) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %39086 = "torch.aten.where.self"(%39083, %39085, %39078) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %39087 = "torch.aten.reciprocal"(%39081) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %39088 = "torch.constant.int"() <{value = 8192 : i64}> : () -> !torch.int
    %39089 = "torch.aten.mul.Scalar"(%39087, %39088) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %39090 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %39091 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %39092 = "torch.aten.sub.Scalar"(%39089, %39090, %39091) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
    %39093 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %39094 = "torch.aten.div.Scalar"(%39092, %39093) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %39095 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %39096 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %39097 = "torch.aten.rsub.Scalar"(%39094, %39095, %39096) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
    %39098 = "torch.aten.mul.Tensor"(%39097, %39086) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %39099 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %39100 = "torch.aten.div.Scalar"(%39098, %39099) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %39101 = "torch.aten.mul.Tensor"(%39094, %39086) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %39102 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %39103 = "torch.aten.add.Tensor"(%39100, %39101, %39102) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %39104 = "torch.constant.float"() <{value = 2.048000e+03 : f64}> : () -> !torch.float
    %39105 = "torch.aten.lt.Scalar"(%39081, %39104) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %39106 = "torch.aten.bitwise_not"(%39105) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %39107 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
    %39108 = "torch.aten.gt.Scalar"(%39081, %39107) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %39109 = "torch.aten.bitwise_not"(%39108) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %39110 = "torch.aten.mul.Tensor"(%39106, %39109) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %39111 = "torch.aten.where.self"(%39110, %39103, %39086) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %39112 = "torch.prim.ListConstruct"(%39111, %39111) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.list<vtensor>
    %39113 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %39114 = "torch.aten.cat"(%39112, %39113) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[128],f32>
    %39115 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %39116 = "torch.prims.convert_element_type"(%39061, %39115) : (!torch.vtensor<[131072],si64>, !torch.int) -> !torch.vtensor<[131072],f32>
    %39117 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %39118 = "torch.prims.convert_element_type"(%39114, %39117) : (!torch.vtensor<[128],f32>, !torch.int) -> !torch.vtensor<[128],f32>
    %39119 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
    %39120 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %39121 = "torch.prim.ListConstruct"(%39119, %39120) : (!torch.int, !torch.int) -> !torch.list<int>
    %39122 = "torch.aten.view"(%39116, %39121) : (!torch.vtensor<[131072],f32>, !torch.list<int>) -> !torch.vtensor<[131072,1],f32>
    %39123 = "torch.aten.mul.Tensor"(%39122, %39118) : (!torch.vtensor<[131072,1],f32>, !torch.vtensor<[128],f32>) -> !torch.vtensor<[131072,128],f32>
    %39124 = "torch.aten.cos"(%39123) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
    %39125 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %39126 = "torch.prims.convert_element_type"(%39124, %39125) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
    %39127 = "torch.aten.sin"(%39123) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
    %39128 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %39129 = "torch.prims.convert_element_type"(%39127, %39128) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
    %39130 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %39131 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %39132 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %39133 = "torch.aten.slice.Tensor"(%39126, %39130, %39131, %18481, %39132) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%39133, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %39134 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %39135 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %39136 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %39137 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %39138 = "torch.aten.slice.Tensor"(%39133, %39134, %39135, %39136, %39137) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%39138, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %39139 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %39140 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %39141 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %39142 = "torch.aten.slice.Tensor"(%39129, %39139, %39140, %18481, %39141) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%39142, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %39143 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %39144 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %39145 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %39146 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %39147 = "torch.aten.slice.Tensor"(%39142, %39143, %39144, %39145, %39146) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%39147, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %39148 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %39149 = "torch.aten.unsqueeze"(%39138, %39148) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%39149, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %39150 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %39151 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %39152 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %39153 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %39154 = "torch.aten.slice.Tensor"(%39149, %39150, %39151, %39152, %39153) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%39154, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %39155 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %39156 = "torch.aten.unsqueeze"(%39154, %39155) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%39156, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %39157 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %39158 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %39159 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %39160 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %39161 = "torch.aten.slice.Tensor"(%39156, %39157, %39158, %39159, %39160) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%39161, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %39162 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %39163 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %39164 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %39165 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %39166 = "torch.prim.ListConstruct"(%39162, %39163, %39164, %39165) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %39167 = "torch.aten.repeat"(%39161, %39166) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%39167, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
    %39168 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %39169 = "torch.aten.unsqueeze"(%39147, %39168) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%39169, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %39170 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %39171 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %39172 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %39173 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %39174 = "torch.aten.slice.Tensor"(%39169, %39170, %39171, %39172, %39173) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%39174, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %39175 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %39176 = "torch.aten.unsqueeze"(%39174, %39175) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%39176, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %39177 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %39178 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %39179 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %39180 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %39181 = "torch.aten.slice.Tensor"(%39176, %39177, %39178, %39179, %39180) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%39181, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %39182 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %39183 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %39184 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %39185 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %39186 = "torch.prim.ListConstruct"(%39182, %39183, %39184, %39185) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %39187 = "torch.aten.repeat"(%39181, %39186) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%39187, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
    %39188 = "torch.aten.mul.Tensor"(%39045, %39167) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%39188, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %39189 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %39190 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %39191 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %39192 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %39193 = "torch.aten.slice.Tensor"(%39045, %39189, %39190, %39191, %39192) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%39193, %18474) <{shape_expressions = #map18}> : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.int) -> ()
    %39194 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %39195 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %39196 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %39197 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %39198 = "torch.aten.slice.Tensor"(%39045, %39194, %39195, %39196, %39197) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%39198, %18474) <{shape_expressions = #map18}> : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.int) -> ()
    %39199 = "torch.aten.neg"(%39198) : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>) -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%39199, %18474) <{shape_expressions = #map18}> : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.int) -> ()
    %39200 = "torch.prim.ListConstruct"(%39199, %39193) : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>) -> !torch.list<vtensor>
    %39201 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %39202 = "torch.aten.cat"(%39200, %39201) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%39202, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %39203 = "torch.aten.mul.Tensor"(%39202, %39187) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%39203, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %39204 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %39205 = "torch.aten.add.Tensor"(%39188, %39203, %39204) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%39205, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %39206 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
    %39207 = "torch.constant.none"() : () -> !torch.none
    %39208 = "torch.constant.none"() : () -> !torch.none
    %39209 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %39210 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %39211 = "torch.aten.arange"(%39206, %39207, %39208, %39209, %39210) : (!torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[131072],si64>
    %39212 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %39213 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %39214 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %39215 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %39216 = "torch.constant.none"() : () -> !torch.none
    %39217 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %39218 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %39219 = "torch.aten.arange.start_step"(%39212, %39213, %39214, %39215, %39216, %39217, %39218) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[64],si64>
    %39220 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %39221 = "torch.prims.convert_element_type"(%39219, %39220) : (!torch.vtensor<[64],si64>, !torch.int) -> !torch.vtensor<[64],f32>
    %39222 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %39223 = "torch.aten.div.Scalar"(%39221, %39222) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %39224 = "torch.constant.float"() <{value = 5.000000e+05 : f64}> : () -> !torch.float
    %39225 = "torch.aten.pow.Scalar"(%39224, %39223) : (!torch.float, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %39226 = "torch.aten.reciprocal"(%39225) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %39227 = "torch.constant.float"() <{value = 1.000000e+00 : f64}> : () -> !torch.float
    %39228 = "torch.aten.mul.Scalar"(%39226, %39227) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
    %39229 = "torch.aten.reciprocal"(%39228) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %39230 = "torch.constant.float"() <{value = 6.2831853071795862 : f64}> : () -> !torch.float
    %39231 = "torch.aten.mul.Scalar"(%39229, %39230) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
    %39232 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
    %39233 = "torch.aten.gt.Scalar"(%39231, %39232) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %39234 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %39235 = "torch.aten.div.Scalar"(%39228, %39234) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %39236 = "torch.aten.where.self"(%39233, %39235, %39228) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %39237 = "torch.aten.reciprocal"(%39231) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %39238 = "torch.constant.int"() <{value = 8192 : i64}> : () -> !torch.int
    %39239 = "torch.aten.mul.Scalar"(%39237, %39238) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %39240 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %39241 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %39242 = "torch.aten.sub.Scalar"(%39239, %39240, %39241) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
    %39243 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %39244 = "torch.aten.div.Scalar"(%39242, %39243) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %39245 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %39246 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %39247 = "torch.aten.rsub.Scalar"(%39244, %39245, %39246) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
    %39248 = "torch.aten.mul.Tensor"(%39247, %39236) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %39249 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %39250 = "torch.aten.div.Scalar"(%39248, %39249) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %39251 = "torch.aten.mul.Tensor"(%39244, %39236) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %39252 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %39253 = "torch.aten.add.Tensor"(%39250, %39251, %39252) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %39254 = "torch.constant.float"() <{value = 2.048000e+03 : f64}> : () -> !torch.float
    %39255 = "torch.aten.lt.Scalar"(%39231, %39254) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %39256 = "torch.aten.bitwise_not"(%39255) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %39257 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
    %39258 = "torch.aten.gt.Scalar"(%39231, %39257) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %39259 = "torch.aten.bitwise_not"(%39258) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %39260 = "torch.aten.mul.Tensor"(%39256, %39259) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %39261 = "torch.aten.where.self"(%39260, %39253, %39236) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %39262 = "torch.prim.ListConstruct"(%39261, %39261) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.list<vtensor>
    %39263 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %39264 = "torch.aten.cat"(%39262, %39263) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[128],f32>
    %39265 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %39266 = "torch.prims.convert_element_type"(%39211, %39265) : (!torch.vtensor<[131072],si64>, !torch.int) -> !torch.vtensor<[131072],f32>
    %39267 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %39268 = "torch.prims.convert_element_type"(%39264, %39267) : (!torch.vtensor<[128],f32>, !torch.int) -> !torch.vtensor<[128],f32>
    %39269 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
    %39270 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %39271 = "torch.prim.ListConstruct"(%39269, %39270) : (!torch.int, !torch.int) -> !torch.list<int>
    %39272 = "torch.aten.view"(%39266, %39271) : (!torch.vtensor<[131072],f32>, !torch.list<int>) -> !torch.vtensor<[131072,1],f32>
    %39273 = "torch.aten.mul.Tensor"(%39272, %39268) : (!torch.vtensor<[131072,1],f32>, !torch.vtensor<[128],f32>) -> !torch.vtensor<[131072,128],f32>
    %39274 = "torch.aten.cos"(%39273) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
    %39275 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %39276 = "torch.prims.convert_element_type"(%39274, %39275) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
    %39277 = "torch.aten.sin"(%39273) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
    %39278 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %39279 = "torch.prims.convert_element_type"(%39277, %39278) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
    %39280 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %39281 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %39282 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %39283 = "torch.aten.slice.Tensor"(%39276, %39280, %39281, %18481, %39282) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%39283, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %39284 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %39285 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %39286 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %39287 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %39288 = "torch.aten.slice.Tensor"(%39283, %39284, %39285, %39286, %39287) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%39288, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %39289 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %39290 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %39291 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %39292 = "torch.aten.slice.Tensor"(%39279, %39289, %39290, %18481, %39291) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%39292, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %39293 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %39294 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %39295 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %39296 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %39297 = "torch.aten.slice.Tensor"(%39292, %39293, %39294, %39295, %39296) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
    "torch.bind_symbolic_shape"(%39297, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
    %39298 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %39299 = "torch.aten.unsqueeze"(%39288, %39298) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%39299, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %39300 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %39301 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %39302 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %39303 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %39304 = "torch.aten.slice.Tensor"(%39299, %39300, %39301, %39302, %39303) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%39304, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %39305 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %39306 = "torch.aten.unsqueeze"(%39304, %39305) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%39306, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %39307 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %39308 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %39309 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %39310 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %39311 = "torch.aten.slice.Tensor"(%39306, %39307, %39308, %39309, %39310) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%39311, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %39312 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %39313 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %39314 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %39315 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %39316 = "torch.prim.ListConstruct"(%39312, %39313, %39314, %39315) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %39317 = "torch.aten.repeat"(%39311, %39316) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%39317, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
    %39318 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %39319 = "torch.aten.unsqueeze"(%39297, %39318) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%39319, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %39320 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %39321 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %39322 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %39323 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %39324 = "torch.aten.slice.Tensor"(%39319, %39320, %39321, %39322, %39323) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
    "torch.bind_symbolic_shape"(%39324, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
    %39325 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %39326 = "torch.aten.unsqueeze"(%39324, %39325) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%39326, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %39327 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %39328 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %39329 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %39330 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %39331 = "torch.aten.slice.Tensor"(%39326, %39327, %39328, %39329, %39330) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%39331, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
    %39332 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %39333 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %39334 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %39335 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %39336 = "torch.prim.ListConstruct"(%39332, %39333, %39334, %39335) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %39337 = "torch.aten.repeat"(%39331, %39336) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
    "torch.bind_symbolic_shape"(%39337, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
    %39338 = "torch.aten.mul.Tensor"(%39050, %39317) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%39338, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %39339 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %39340 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %39341 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %39342 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %39343 = "torch.aten.slice.Tensor"(%39050, %39339, %39340, %39341, %39342) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%39343, %18474) <{shape_expressions = #map19}> : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.int) -> ()
    %39344 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %39345 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %39346 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %39347 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %39348 = "torch.aten.slice.Tensor"(%39050, %39344, %39345, %39346, %39347) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%39348, %18474) <{shape_expressions = #map19}> : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.int) -> ()
    %39349 = "torch.aten.neg"(%39348) : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>) -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%39349, %18474) <{shape_expressions = #map19}> : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.int) -> ()
    %39350 = "torch.prim.ListConstruct"(%39349, %39343) : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>) -> !torch.list<vtensor>
    %39351 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %39352 = "torch.aten.cat"(%39350, %39351) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%39352, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %39353 = "torch.aten.mul.Tensor"(%39352, %39337) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%39353, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %39354 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %39355 = "torch.aten.add.Tensor"(%39338, %39353, %39354) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%39355, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %39356 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %39357 = "torch.aten.mul.Scalar"(%arg69, %39356) : (!torch.vtensor<[4,?],si64>, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%39357, %18474) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %39358 = "torch.constant.int"() <{value = 62 : i64}> : () -> !torch.int
    %39359 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %39360 = "torch.aten.add.Scalar"(%39357, %39358, %39359) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%39360, %18474) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %39361 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %39362 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %39363 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %39364 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %39365 = "torch.prim.ListConstruct"(%39361, %18477, %39362, %39363, %39364) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %39366 = "torch.aten.view"(%39355, %39365) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%39366, %18474) <{shape_expressions = #map22}> : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %39367 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %39368 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %39369 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %39370 = "torch.prim.ListConstruct"(%19011, %39367, %39368, %39369) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %39371 = "torch.aten.view"(%39366, %39370) : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%39371, %18474) <{shape_expressions = #map23}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %39372 = "torch.prim.ListConstruct"(%19011) : (!torch.int) -> !torch.list<int>
    %39373 = "torch.aten.view"(%39360, %39372) : (!torch.vtensor<[4,?],si64>, !torch.list<int>) -> !torch.vtensor<[?],si64>
    "torch.bind_symbolic_shape"(%39373, %18474) <{shape_expressions = #map24}> : (!torch.vtensor<[?],si64>, !torch.int) -> ()
    %39374 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %39375 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %39376 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %39377 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %39378 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %39379 = "torch.prim.ListConstruct"(%18479, %39374, %39375, %39376, %39377, %39378) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %39380 = "torch.aten.view"(%38782, %39379) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%39380, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %39381 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %39382 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %39383 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %39384 = "torch.prim.ListConstruct"(%18993, %39381, %39382, %39383) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %39385 = "torch.aten.view"(%39380, %39384) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%39385, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %39386 = "torch.prim.ListConstruct"(%39373) : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>>
    %39387 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %39388 = "torch.aten.index_put"(%39385, %39386, %39371, %39387) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.bool) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%39388, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %39389 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %39390 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %39391 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %39392 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %39393 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %39394 = "torch.prim.ListConstruct"(%18479, %39389, %39390, %39391, %39392, %39393) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %39395 = "torch.aten.view"(%39388, %39394) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%39395, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %39396 = "torch.constant.int"() <{value = 2097152 : i64}> : () -> !torch.int
    %39397 = "torch.prim.ListConstruct"(%18479, %39396) : (!torch.int, !torch.int) -> !torch.list<int>
    %39398 = "torch.aten.view"(%39395, %39397) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,2097152],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%39398, %18475) <{shape_expressions = #map2}> : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> ()
    %39399 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %39400 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %39401 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %39402 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %39403 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %39404 = "torch.prim.ListConstruct"(%18479, %39399, %39400, %39401, %39402, %39403) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %39405 = "torch.aten.view"(%39398, %39404) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%39405, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %39406 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %39407 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %39408 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %39409 = "torch.prim.ListConstruct"(%18993, %39406, %39407, %39408) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %39410 = "torch.aten.view"(%39405, %39409) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%39410, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %39411 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %39412 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %39413 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %39414 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %39415 = "torch.prim.ListConstruct"(%39411, %18477, %39412, %39413, %39414) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %39416 = "torch.aten.view"(%39055, %39415) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%39416, %18474) <{shape_expressions = #map22}> : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %39417 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %39418 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %39419 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %39420 = "torch.prim.ListConstruct"(%19011, %39417, %39418, %39419) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %39421 = "torch.aten.view"(%39416, %39420) : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%39421, %18474) <{shape_expressions = #map23}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %39422 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %39423 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %39424 = "torch.aten.add.Scalar"(%39360, %39422, %39423) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%39424, %18474) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %39425 = "torch.prim.ListConstruct"(%19011) : (!torch.int) -> !torch.list<int>
    %39426 = "torch.aten.view"(%39424, %39425) : (!torch.vtensor<[4,?],si64>, !torch.list<int>) -> !torch.vtensor<[?],si64>
    "torch.bind_symbolic_shape"(%39426, %18474) <{shape_expressions = #map24}> : (!torch.vtensor<[?],si64>, !torch.int) -> ()
    %39427 = "torch.prim.ListConstruct"(%39426) : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>>
    %39428 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %39429 = "torch.aten.index_put"(%39410, %39427, %39421, %39428) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.bool) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%39429, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %39430 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %39431 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %39432 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %39433 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %39434 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %39435 = "torch.prim.ListConstruct"(%18479, %39430, %39431, %39432, %39433, %39434) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %39436 = "torch.aten.view"(%39429, %39435) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%39436, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %39437 = "torch.constant.int"() <{value = 2097152 : i64}> : () -> !torch.int
    %39438 = "torch.prim.ListConstruct"(%18479, %39437) : (!torch.int, !torch.int) -> !torch.list<int>
    %39439 = "torch.aten.view"(%39436, %39438) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,2097152],f8E4M3FNUZ>
    "torch.overwrite.tensor.contents"(%39439, %arg70) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.tensor<[?,2097152],f8E4M3FNUZ>) -> ()
    "torch.bind_symbolic_shape"(%39439, %18475) <{shape_expressions = #map2}> : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> ()
    %39440 = "torch.constant.int"() <{value = -2 : i64}> : () -> !torch.int
    %39441 = "torch.aten.unsqueeze"(%39355, %39440) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%39441, %18474) <{shape_expressions = #map25}> : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.int) -> ()
    %39442 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %39443 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %39444 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %39445 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %39446 = "torch.prim.ListConstruct"(%39442, %18481, %39443, %39444, %39445) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %39447 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %39448 = "torch.aten.expand"(%39441, %39446, %39447) : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%39448, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %39449 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %39450 = "torch.aten.clone"(%39448, %39449) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%39450, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %39451 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %39452 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %39453 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %39454 = "torch.prim.ListConstruct"(%39451, %18481, %39452, %39453) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %39455 = "torch.aten._unsafe_view"(%39450, %39454) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%39455, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %39456 = "torch.constant.int"() <{value = -2 : i64}> : () -> !torch.int
    %39457 = "torch.aten.unsqueeze"(%39055, %39456) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%39457, %18474) <{shape_expressions = #map25}> : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.int) -> ()
    %39458 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %39459 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %39460 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %39461 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %39462 = "torch.prim.ListConstruct"(%39458, %18481, %39459, %39460, %39461) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %39463 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %39464 = "torch.aten.expand"(%39457, %39462, %39463) : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%39464, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %39465 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %39466 = "torch.aten.clone"(%39464, %39465) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%39466, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %39467 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %39468 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %39469 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %39470 = "torch.prim.ListConstruct"(%39467, %18481, %39468, %39469) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %39471 = "torch.aten._unsafe_view"(%39466, %39470) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%39471, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %39472 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %39473 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %39474 = "torch.aten.transpose.int"(%39205, %39472, %39473) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%39474, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %39475 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %39476 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %39477 = "torch.aten.transpose.int"(%39455, %39475, %39476) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%39477, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %39478 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %39479 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %39480 = "torch.aten.transpose.int"(%39471, %39478, %39479) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%39480, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %39481 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %39482 = "torch.aten.squeeze.dim"(%18570, %39481) : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,1,?,?],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%39482, %18474) <{shape_expressions = #map8}> : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> ()
    %39483 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %39484 = "torch.aten.squeeze.dim"(%39482, %39483) : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,1,?,?],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%39484, %18474) <{shape_expressions = #map8}> : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> ()
    %39485 = "torch_c.to_builtin_tensor"(%39474) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %39486 = "torch_c.to_builtin_tensor"(%39477) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %39487 = "torch_c.to_builtin_tensor"(%39480) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %39488 = "torch_c.to_builtin_tensor"(%39484) : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>) -> tensor<4x1x?x?xf8E4M3FNUZ>
    %39489 = "tensor.cast"(%39488) : (tensor<4x1x?x?xf8E4M3FNUZ>) -> tensor<?x?x?x?xf8E4M3FNUZ>
    %39490 = "torch_c.to_builtin_tensor"(%18449) : (!torch.vtensor<[],f32>) -> tensor<f32>
    %39491 = "util.call"(%39485, %39486, %39487, %39490, %39489) <{callee = @sharktank_masked_flash_attention_4_32_128_128_f8E4M3FNUZ_f32_f32}> : (tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<f32>, tensor<?x?x?x?xf8E4M3FNUZ>) -> tensor<4x32x?x128xf32>
    %39492 = "torch_c.from_builtin_tensor"(%39491) : (tensor<4x32x?x128xf32>) -> !torch.vtensor<[4,32,?,128],f32>
    "torch.bind_symbolic_shape"(%39492, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f32>, !torch.int) -> ()
    %39493 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %39494 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %39495 = "torch.aten.transpose.int"(%39492, %39493, %39494) : (!torch.vtensor<[4,32,?,128],f32>, !torch.int, !torch.int) -> !torch.vtensor<[4,?,32,128],f32>
    "torch.bind_symbolic_shape"(%39495, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f32>, !torch.int) -> ()
    %39496 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %39497 = "torch.aten.clone"(%39495, %39496) : (!torch.vtensor<[4,?,32,128],f32>, !torch.int) -> !torch.vtensor<[4,?,32,128],f32>
    "torch.bind_symbolic_shape"(%39497, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f32>, !torch.int) -> ()
    %39498 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %39499 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %39500 = "torch.prim.ListConstruct"(%39498, %18481, %39499) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %39501 = "torch.aten._unsafe_view"(%39497, %39500) : (!torch.vtensor<[4,?,32,128],f32>, !torch.list<int>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%39501, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %39502 = "torch.aten.div.Tensor"(%39501, %18451) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%39502, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %39503 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %39504 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %39505 = "torch.aten.clamp"(%39502, %39503, %39504) : (!torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%39505, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %39506 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %39507 = "torch.prims.convert_element_type"(%39505, %39506) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%39507, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %39508 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %39509 = "torch.aten.unsqueeze"(%18453, %39508) : (!torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>
    %39510 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %39511 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %39512 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %39513 = "torch.prim.ListConstruct"(%39510, %39511, %39512) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %39514 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %39515 = "torch.aten.expand"(%39509, %39513, %39514) : (!torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ>
    %39516 = "torch_c.to_builtin_tensor"(%39507) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %39517 = "torch_c.to_builtin_tensor"(%39515) : (!torch.vtensor<[4,4096,4096],f8E4M3FNUZ>) -> tensor<4x4096x4096xf8E4M3FNUZ>
    %39518 = "util.call"(%39516, %39517) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x?x4096xf32>
    %39519 = "torch_c.from_builtin_tensor"(%39518) : (tensor<4x?x4096xf32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%39519, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %39520 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %39521 = "torch.prims.convert_element_type"(%39519, %39520) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%39521, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %39522 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %39523 = "torch.aten.add.Tensor"(%38949, %39521, %39522) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%39523, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %39524 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %39525 = "torch.prims.convert_element_type"(%39523, %39524) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%39525, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %39526 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %39527 = "torch.aten.pow.Tensor_Scalar"(%39525, %39526) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%39527, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %39528 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %39529 = "torch.prim.ListConstruct"(%39528) : (!torch.int) -> !torch.list<int>
    %39530 = "torch.constant.bool"() <{value = true}> : () -> !torch.bool
    %39531 = "torch.constant.none"() : () -> !torch.none
    %39532 = "torch.aten.mean.dim"(%39527, %39529, %39530, %39531) : (!torch.vtensor<[4,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%39532, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %39533 = "torch.constant.float"() <{value = 1.000000e-05 : f64}> : () -> !torch.float
    %39534 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %39535 = "torch.aten.add.Scalar"(%39532, %39533, %39534) : (!torch.vtensor<[4,?,1],f32>, !torch.float, !torch.int) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%39535, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %39536 = "torch.aten.rsqrt"(%39535) : (!torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%39536, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %39537 = "torch.aten.mul.Tensor"(%39525, %39536) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%39537, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %39538 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %39539 = "torch.prims.convert_element_type"(%39537, %39538) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%39539, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %39540 = "torch.aten.mul.Tensor"(%18455, %39539) : (!torch.vtensor<[4096],bf16>, !torch.vtensor<[4,?,4096],bf16>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%39540, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %39541 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %39542 = "torch.prims.convert_element_type"(%39540, %39541) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%39542, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %39543 = "torch.aten.div.Tensor"(%39542, %18457) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%39543, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %39544 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %39545 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %39546 = "torch.aten.clamp"(%39543, %39544, %39545) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%39546, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %39547 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %39548 = "torch.prims.convert_element_type"(%39546, %39547) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%39548, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %39549 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %39550 = "torch.aten.unsqueeze"(%18459, %39549) : (!torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>
    %39551 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %39552 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %39553 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %39554 = "torch.prim.ListConstruct"(%39551, %39552, %39553) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %39555 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %39556 = "torch.aten.expand"(%39550, %39554, %39555) : (!torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ>
    %39557 = "torch_c.to_builtin_tensor"(%39548) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %39558 = "torch_c.to_builtin_tensor"(%39556) : (!torch.vtensor<[4,14336,4096],f8E4M3FNUZ>) -> tensor<4x14336x4096xf8E4M3FNUZ>
    %39559 = "util.call"(%39557, %39558) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x?x14336xf32>
    %39560 = "torch_c.from_builtin_tensor"(%39559) : (tensor<4x?x14336xf32>) -> !torch.vtensor<[4,?,14336],f32>
    "torch.bind_symbolic_shape"(%39560, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> ()
    %39561 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %39562 = "torch.prims.convert_element_type"(%39560, %39561) : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%39562, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %39563 = "torch.aten.silu"(%39562) : (!torch.vtensor<[4,?,14336],bf16>) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%39563, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %39564 = "torch.aten.div.Tensor"(%39542, %18461) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%39564, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %39565 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %39566 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %39567 = "torch.aten.clamp"(%39564, %39565, %39566) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%39567, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %39568 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %39569 = "torch.prims.convert_element_type"(%39567, %39568) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%39569, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
    %39570 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %39571 = "torch.aten.unsqueeze"(%18463, %39570) : (!torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>
    %39572 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %39573 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %39574 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %39575 = "torch.prim.ListConstruct"(%39572, %39573, %39574) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %39576 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %39577 = "torch.aten.expand"(%39571, %39575, %39576) : (!torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ>
    %39578 = "torch_c.to_builtin_tensor"(%39569) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
    %39579 = "torch_c.to_builtin_tensor"(%39577) : (!torch.vtensor<[4,14336,4096],f8E4M3FNUZ>) -> tensor<4x14336x4096xf8E4M3FNUZ>
    %39580 = "util.call"(%39578, %39579) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x?x14336xf32>
    %39581 = "torch_c.from_builtin_tensor"(%39580) : (tensor<4x?x14336xf32>) -> !torch.vtensor<[4,?,14336],f32>
    "torch.bind_symbolic_shape"(%39581, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> ()
    %39582 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %39583 = "torch.prims.convert_element_type"(%39581, %39582) : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%39583, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %39584 = "torch.aten.mul.Tensor"(%39563, %39583) : (!torch.vtensor<[4,?,14336],bf16>, !torch.vtensor<[4,?,14336],bf16>) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%39584, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %39585 = "torch.aten.div.Tensor"(%39584, %18465) : (!torch.vtensor<[4,?,14336],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%39585, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %39586 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %39587 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %39588 = "torch.aten.clamp"(%39585, %39586, %39587) : (!torch.vtensor<[4,?,14336],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,14336],bf16>
    "torch.bind_symbolic_shape"(%39588, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
    %39589 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %39590 = "torch.prims.convert_element_type"(%39588, %39589) : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> !torch.vtensor<[4,?,14336],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%39590, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],f8E4M3FNUZ>, !torch.int) -> ()
    %39591 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %39592 = "torch.aten.unsqueeze"(%18467, %39591) : (!torch.vtensor<[4096,14336],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,14336],f8E4M3FNUZ>
    %39593 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %39594 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %39595 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %39596 = "torch.prim.ListConstruct"(%39593, %39594, %39595) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %39597 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %39598 = "torch.aten.expand"(%39592, %39596, %39597) : (!torch.vtensor<[1,4096,14336],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,14336],f8E4M3FNUZ>
    %39599 = "torch_c.to_builtin_tensor"(%39590) : (!torch.vtensor<[4,?,14336],f8E4M3FNUZ>) -> tensor<4x?x14336xf8E4M3FNUZ>
    %39600 = "torch_c.to_builtin_tensor"(%39598) : (!torch.vtensor<[4,4096,14336],f8E4M3FNUZ>) -> tensor<4x4096x14336xf8E4M3FNUZ>
    %39601 = "util.call"(%39599, %39600) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx14336xf8E4M3FNUZ_R4x4096x14336xf8E4M3FNUZ}> : (tensor<4x?x14336xf8E4M3FNUZ>, tensor<4x4096x14336xf8E4M3FNUZ>) -> tensor<4x?x4096xf32>
    %39602 = "torch_c.from_builtin_tensor"(%39601) : (tensor<4x?x4096xf32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%39602, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %39603 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %39604 = "torch.prims.convert_element_type"(%39602, %39603) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%39604, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %39605 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %39606 = "torch.aten.add.Tensor"(%39523, %39604, %39605) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%39606, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %39607 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %39608 = "torch.prims.convert_element_type"(%39606, %39607) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%39608, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %39609 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %39610 = "torch.aten.pow.Tensor_Scalar"(%39608, %39609) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%39610, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %39611 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %39612 = "torch.prim.ListConstruct"(%39611) : (!torch.int) -> !torch.list<int>
    %39613 = "torch.constant.bool"() <{value = true}> : () -> !torch.bool
    %39614 = "torch.constant.none"() : () -> !torch.none
    %39615 = "torch.aten.mean.dim"(%39610, %39612, %39613, %39614) : (!torch.vtensor<[4,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%39615, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %39616 = "torch.constant.float"() <{value = 1.000000e-05 : f64}> : () -> !torch.float
    %39617 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %39618 = "torch.aten.add.Scalar"(%39615, %39616, %39617) : (!torch.vtensor<[4,?,1],f32>, !torch.float, !torch.int) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%39618, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %39619 = "torch.aten.rsqrt"(%39618) : (!torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,1],f32>
    "torch.bind_symbolic_shape"(%39619, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
    %39620 = "torch.aten.mul.Tensor"(%39608, %39619) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,4096],f32>
    "torch.bind_symbolic_shape"(%39620, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
    %39621 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %39622 = "torch.prims.convert_element_type"(%39620, %39621) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%39622, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %39623 = "torch.aten.mul.Tensor"(%18469, %39622) : (!torch.vtensor<[4096],bf16>, !torch.vtensor<[4,?,4096],bf16>) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%39623, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %39624 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %39625 = "torch.prims.convert_element_type"(%39623, %39624) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
    "torch.bind_symbolic_shape"(%39625, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
    %39626 = "torch.constant.int"() <{value = -2 : i64}> : () -> !torch.int
    %39627 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %39628 = "torch.aten.transpose.int"(%18471, %39626, %39627) : (!torch.vtensor<[128256,4096],bf16>, !torch.int, !torch.int) -> !torch.vtensor<[4096,128256],bf16>
    %39629 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %39630 = "torch.prims.convert_element_type"(%39628, %39629) : (!torch.vtensor<[4096,128256],bf16>, !torch.int) -> !torch.vtensor<[4096,128256],bf16>
    %39631 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %39632 = "torch.aten.mul.int"(%39631, %18481) : (!torch.int, !torch.int) -> !torch.int
    %39633 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %39634 = "torch.prim.ListConstruct"(%39632, %39633) : (!torch.int, !torch.int) -> !torch.list<int>
    %39635 = "torch.aten.view"(%39625, %39634) : (!torch.vtensor<[4,?,4096],bf16>, !torch.list<int>) -> !torch.vtensor<[?,4096],bf16>
    "torch.bind_symbolic_shape"(%39635, %18474) <{shape_expressions = #map29}> : (!torch.vtensor<[?,4096],bf16>, !torch.int) -> ()
    %39636 = "torch.aten.mm"(%39635, %39630) : (!torch.vtensor<[?,4096],bf16>, !torch.vtensor<[4096,128256],bf16>) -> !torch.vtensor<[?,128256],bf16>
    "torch.bind_symbolic_shape"(%39636, %18474) <{shape_expressions = #map30}> : (!torch.vtensor<[?,128256],bf16>, !torch.int) -> ()
    %39637 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %39638 = "torch.constant.int"() <{value = 128256 : i64}> : () -> !torch.int
    %39639 = "torch.prim.ListConstruct"(%39637, %18481, %39638) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %39640 = "torch.aten.view"(%39636, %39639) : (!torch.vtensor<[?,128256],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,128256],bf16>
    "torch.bind_symbolic_shape"(%39640, %18474) <{shape_expressions = #map31}> : (!torch.vtensor<[4,?,128256],bf16>, !torch.int) -> ()
    "func.return"(%39640) : (!torch.vtensor<[4,?,128256],bf16>) -> ()
  }) {torch.assume_strict_symbolic_shapes} : () -> ()
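  // Second entry point: decode_bs4 takes single-step inputs ([4, 1] token ids,
  // two [4] si64 vectors that look like per-sequence lengths and start
  // positions, the [4, ?] page table, and the same mutable paged KV cache) and
  // produces [4, 1, 128256] logits for one decode step.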
  "func.func"() <{arg_attrs = [{}, {}, {}, {}, {}], function_type = (!torch.vtensor<[4,1],si64>, !torch.vtensor<[4],si64>, !torch.vtensor<[4],si64>, !torch.vtensor<[4,?],si64>, !torch.tensor<[?,2097152],f8E4M3FNUZ>) -> !torch.vtensor<[4,1,128256],bf16>, sym_name = "decode_bs4"}> ({
  ^bb0(%arg62: !torch.vtensor<[4,1],si64>, %arg63: !torch.vtensor<[4],si64>, %arg64: !torch.vtensor<[4],si64>, %arg65: !torch.vtensor<[4,?],si64>, %arg66: !torch.tensor<[?,2097152],f8E4M3FNUZ>):
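    // Parameter preamble, repeated per transformer block: each
    // util.global.load pulls a frozen parameter (norm weights in bf16,
    // quantized attention and FFN weights in f8E4M3FNUZ, plus their per-tensor
    // rscale factors) and torch_c.from_builtin_tensor rebinds it as a torch
    // vtensor. The dense<0>/dense<1> literals (%149, %150, ...) are small
    // per-block scalar constants.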
    %127 = "util.global.load"() <{global = @__auto.token_embd.weight}> : () -> tensor<128256x4096xbf16>
    %128 = "torch_c.from_builtin_tensor"(%127) : (tensor<128256x4096xbf16>) -> !torch.vtensor<[128256,4096],bf16>
    %129 = "util.global.load"() <{global = @__auto.blk.0.attn_norm.weight}> : () -> tensor<4096xbf16>
    %130 = "torch_c.from_builtin_tensor"(%129) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
    %131 = "util.global.load"() <{global = @"__auto.blk.0.attn_q.q_input:rscale"}> : () -> tensor<f32>
    %132 = "torch_c.from_builtin_tensor"(%131) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %133 = "util.global.load"() <{global = @"__auto.blk.0.attn_q.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
    %134 = "torch_c.from_builtin_tensor"(%133) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
    %135 = "util.global.load"() <{global = @"__auto.blk.0.attn_q.q_output:rscale"}> : () -> tensor<f32>
    %136 = "torch_c.from_builtin_tensor"(%135) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %137 = "util.global.load"() <{global = @"__auto.blk.0.attn_k.q_input:rscale"}> : () -> tensor<f32>
    %138 = "torch_c.from_builtin_tensor"(%137) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %139 = "util.global.load"() <{global = @"__auto.blk.0.attn_k.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
    %140 = "torch_c.from_builtin_tensor"(%139) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
    %141 = "util.global.load"() <{global = @"__auto.blk.0.attn_k.q_output:rscale"}> : () -> tensor<f32>
    %142 = "torch_c.from_builtin_tensor"(%141) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %143 = "util.global.load"() <{global = @"__auto.blk.0.attn_v.q_input:rscale"}> : () -> tensor<f32>
    %144 = "torch_c.from_builtin_tensor"(%143) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %145 = "util.global.load"() <{global = @"__auto.blk.0.attn_v.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
    %146 = "torch_c.from_builtin_tensor"(%145) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
    %147 = "util.global.load"() <{global = @"__auto.blk.0.attn_v.q_output:rscale"}> : () -> tensor<f32>
    %148 = "torch_c.from_builtin_tensor"(%147) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %149 = "torch.vtensor.literal"() <{value = dense<0> : tensor<si64>}> : () -> !torch.vtensor<[],si64>
    %150 = "torch.vtensor.literal"() <{value = dense<1> : tensor<si64>}> : () -> !torch.vtensor<[],si64>
    %151 = "util.global.load"() <{global = @__auto.blk.0.attn_scale}> : () -> tensor<f32>
    %152 = "torch_c.from_builtin_tensor"(%151) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %153 = "util.global.load"() <{global = @"__auto.blk.0.attn_output.q_input:rscale"}> : () -> tensor<f32>
    %154 = "torch_c.from_builtin_tensor"(%153) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %155 = "util.global.load"() <{global = @"__auto.blk.0.attn_output.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
    %156 = "torch_c.from_builtin_tensor"(%155) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
    %157 = "util.global.load"() <{global = @__auto.blk.0.ffn_norm.weight}> : () -> tensor<4096xbf16>
    %158 = "torch_c.from_builtin_tensor"(%157) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
    %159 = "util.global.load"() <{global = @"__auto.blk.0.ffn_gate.q_input:rscale"}> : () -> tensor<f32>
    %160 = "torch_c.from_builtin_tensor"(%159) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %161 = "util.global.load"() <{global = @"__auto.blk.0.ffn_gate.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
    %162 = "torch_c.from_builtin_tensor"(%161) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
    %163 = "util.global.load"() <{global = @"__auto.blk.0.ffn_up.q_input:rscale"}> : () -> tensor<f32>
    %164 = "torch_c.from_builtin_tensor"(%163) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %165 = "util.global.load"() <{global = @"__auto.blk.0.ffn_up.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
    %166 = "torch_c.from_builtin_tensor"(%165) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
    %167 = "util.global.load"() <{global = @"__auto.blk.0.ffn_down.q_input:rscale"}> : () -> tensor<f32>
    %168 = "torch_c.from_builtin_tensor"(%167) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %169 = "util.global.load"() <{global = @"__auto.blk.0.ffn_down.weight:qs"}> : () -> tensor<4096x14336xf8E4M3FNUZ>
    %170 = "torch_c.from_builtin_tensor"(%169) : (tensor<4096x14336xf8E4M3FNUZ>) -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
    %171 = "util.global.load"() <{global = @__auto.blk.1.attn_norm.weight}> : () -> tensor<4096xbf16>
    %172 = "torch_c.from_builtin_tensor"(%171) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
    %173 = "util.global.load"() <{global = @"__auto.blk.1.attn_q.q_input:rscale"}> : () -> tensor<f32>
    %174 = "torch_c.from_builtin_tensor"(%173) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %175 = "util.global.load"() <{global = @"__auto.blk.1.attn_q.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
    %176 = "torch_c.from_builtin_tensor"(%175) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
    %177 = "util.global.load"() <{global = @"__auto.blk.1.attn_q.q_output:rscale"}> : () -> tensor<f32>
    %178 = "torch_c.from_builtin_tensor"(%177) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %179 = "util.global.load"() <{global = @"__auto.blk.1.attn_k.q_input:rscale"}> : () -> tensor<f32>
    %180 = "torch_c.from_builtin_tensor"(%179) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %181 = "util.global.load"() <{global = @"__auto.blk.1.attn_k.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
    %182 = "torch_c.from_builtin_tensor"(%181) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
    %183 = "util.global.load"() <{global = @"__auto.blk.1.attn_k.q_output:rscale"}> : () -> tensor<f32>
    %184 = "torch_c.from_builtin_tensor"(%183) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %185 = "util.global.load"() <{global = @"__auto.blk.1.attn_v.q_input:rscale"}> : () -> tensor<f32>
    %186 = "torch_c.from_builtin_tensor"(%185) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %187 = "util.global.load"() <{global = @"__auto.blk.1.attn_v.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
    %188 = "torch_c.from_builtin_tensor"(%187) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
    %189 = "util.global.load"() <{global = @"__auto.blk.1.attn_v.q_output:rscale"}> : () -> tensor<f32>
    %190 = "torch_c.from_builtin_tensor"(%189) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %191 = "torch.vtensor.literal"() <{value = dense<0> : tensor<si64>}> : () -> !torch.vtensor<[],si64>
    %192 = "torch.vtensor.literal"() <{value = dense<1> : tensor<si64>}> : () -> !torch.vtensor<[],si64>
    %193 = "util.global.load"() <{global = @__auto.blk.1.attn_scale}> : () -> tensor<f32>
    %194 = "torch_c.from_builtin_tensor"(%193) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %195 = "util.global.load"() <{global = @"__auto.blk.1.attn_output.q_input:rscale"}> : () -> tensor<f32>
    %196 = "torch_c.from_builtin_tensor"(%195) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %197 = "util.global.load"() <{global = @"__auto.blk.1.attn_output.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
    %198 = "torch_c.from_builtin_tensor"(%197) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
    %199 = "util.global.load"() <{global = @__auto.blk.1.ffn_norm.weight}> : () -> tensor<4096xbf16>
    %200 = "torch_c.from_builtin_tensor"(%199) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
    %201 = "util.global.load"() <{global = @"__auto.blk.1.ffn_gate.q_input:rscale"}> : () -> tensor<f32>
    %202 = "torch_c.from_builtin_tensor"(%201) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %203 = "util.global.load"() <{global = @"__auto.blk.1.ffn_gate.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
    %204 = "torch_c.from_builtin_tensor"(%203) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
    %205 = "util.global.load"() <{global = @"__auto.blk.1.ffn_up.q_input:rscale"}> : () -> tensor<f32>
    %206 = "torch_c.from_builtin_tensor"(%205) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %207 = "util.global.load"() <{global = @"__auto.blk.1.ffn_up.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
    %208 = "torch_c.from_builtin_tensor"(%207) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
    %209 = "util.global.load"() <{global = @"__auto.blk.1.ffn_down.q_input:rscale"}> : () -> tensor<f32>
    %210 = "torch_c.from_builtin_tensor"(%209) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %211 = "util.global.load"() <{global = @"__auto.blk.1.ffn_down.weight:qs"}> : () -> tensor<4096x14336xf8E4M3FNUZ>
    %212 = "torch_c.from_builtin_tensor"(%211) : (tensor<4096x14336xf8E4M3FNUZ>) -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
    %213 = "util.global.load"() <{global = @__auto.blk.2.attn_norm.weight}> : () -> tensor<4096xbf16>
    %214 = "torch_c.from_builtin_tensor"(%213) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
    %215 = "util.global.load"() <{global = @"__auto.blk.2.attn_q.q_input:rscale"}> : () -> tensor<f32>
    %216 = "torch_c.from_builtin_tensor"(%215) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %217 = "util.global.load"() <{global = @"__auto.blk.2.attn_q.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
    %218 = "torch_c.from_builtin_tensor"(%217) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
    %219 = "util.global.load"() <{global = @"__auto.blk.2.attn_q.q_output:rscale"}> : () -> tensor<f32>
    %220 = "torch_c.from_builtin_tensor"(%219) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %221 = "util.global.load"() <{global = @"__auto.blk.2.attn_k.q_input:rscale"}> : () -> tensor<f32>
    %222 = "torch_c.from_builtin_tensor"(%221) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %223 = "util.global.load"() <{global = @"__auto.blk.2.attn_k.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
    %224 = "torch_c.from_builtin_tensor"(%223) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
    %225 = "util.global.load"() <{global = @"__auto.blk.2.attn_k.q_output:rscale"}> : () -> tensor<f32>
    %226 = "torch_c.from_builtin_tensor"(%225) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %227 = "util.global.load"() <{global = @"__auto.blk.2.attn_v.q_input:rscale"}> : () -> tensor<f32>
    %228 = "torch_c.from_builtin_tensor"(%227) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %229 = "util.global.load"() <{global = @"__auto.blk.2.attn_v.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
    %230 = "torch_c.from_builtin_tensor"(%229) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
    %231 = "util.global.load"() <{global = @"__auto.blk.2.attn_v.q_output:rscale"}> : () -> tensor<f32>
    %232 = "torch_c.from_builtin_tensor"(%231) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %233 = "torch.vtensor.literal"() <{value = dense<0> : tensor<si64>}> : () -> !torch.vtensor<[],si64>
    %234 = "torch.vtensor.literal"() <{value = dense<1> : tensor<si64>}> : () -> !torch.vtensor<[],si64>
    %235 = "util.global.load"() <{global = @__auto.blk.2.attn_scale}> : () -> tensor<f32>
    %236 = "torch_c.from_builtin_tensor"(%235) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %237 = "util.global.load"() <{global = @"__auto.blk.2.attn_output.q_input:rscale"}> : () -> tensor<f32>
    %238 = "torch_c.from_builtin_tensor"(%237) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %239 = "util.global.load"() <{global = @"__auto.blk.2.attn_output.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
    %240 = "torch_c.from_builtin_tensor"(%239) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
    %241 = "util.global.load"() <{global = @__auto.blk.2.ffn_norm.weight}> : () -> tensor<4096xbf16>
    %242 = "torch_c.from_builtin_tensor"(%241) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
    %243 = "util.global.load"() <{global = @"__auto.blk.2.ffn_gate.q_input:rscale"}> : () -> tensor<f32>
    %244 = "torch_c.from_builtin_tensor"(%243) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %245 = "util.global.load"() <{global = @"__auto.blk.2.ffn_gate.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
    %246 = "torch_c.from_builtin_tensor"(%245) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
    %247 = "util.global.load"() <{global = @"__auto.blk.2.ffn_up.q_input:rscale"}> : () -> tensor<f32>
    %248 = "torch_c.from_builtin_tensor"(%247) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %249 = "util.global.load"() <{global = @"__auto.blk.2.ffn_up.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
    %250 = "torch_c.from_builtin_tensor"(%249) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
    %251 = "util.global.load"() <{global = @"__auto.blk.2.ffn_down.q_input:rscale"}> : () -> tensor<f32>
    %252 = "torch_c.from_builtin_tensor"(%251) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %253 = "util.global.load"() <{global = @"__auto.blk.2.ffn_down.weight:qs"}> : () -> tensor<4096x14336xf8E4M3FNUZ>
    %254 = "torch_c.from_builtin_tensor"(%253) : (tensor<4096x14336xf8E4M3FNUZ>) -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
    %255 = "util.global.load"() <{global = @__auto.blk.3.attn_norm.weight}> : () -> tensor<4096xbf16>
    %256 = "torch_c.from_builtin_tensor"(%255) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
    %257 = "util.global.load"() <{global = @"__auto.blk.3.attn_q.q_input:rscale"}> : () -> tensor<f32>
    %258 = "torch_c.from_builtin_tensor"(%257) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %259 = "util.global.load"() <{global = @"__auto.blk.3.attn_q.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
    %260 = "torch_c.from_builtin_tensor"(%259) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
    %261 = "util.global.load"() <{global = @"__auto.blk.3.attn_q.q_output:rscale"}> : () -> tensor<f32>
    %262 = "torch_c.from_builtin_tensor"(%261) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %263 = "util.global.load"() <{global = @"__auto.blk.3.attn_k.q_input:rscale"}> : () -> tensor<f32>
    %264 = "torch_c.from_builtin_tensor"(%263) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %265 = "util.global.load"() <{global = @"__auto.blk.3.attn_k.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
    %266 = "torch_c.from_builtin_tensor"(%265) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
    %267 = "util.global.load"() <{global = @"__auto.blk.3.attn_k.q_output:rscale"}> : () -> tensor<f32>
    %268 = "torch_c.from_builtin_tensor"(%267) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %269 = "util.global.load"() <{global = @"__auto.blk.3.attn_v.q_input:rscale"}> : () -> tensor<f32>
    %270 = "torch_c.from_builtin_tensor"(%269) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %271 = "util.global.load"() <{global = @"__auto.blk.3.attn_v.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
    %272 = "torch_c.from_builtin_tensor"(%271) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
    %273 = "util.global.load"() <{global = @"__auto.blk.3.attn_v.q_output:rscale"}> : () -> tensor<f32>
    %274 = "torch_c.from_builtin_tensor"(%273) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %275 = "torch.vtensor.literal"() <{value = dense<0> : tensor<si64>}> : () -> !torch.vtensor<[],si64>
    %276 = "torch.vtensor.literal"() <{value = dense<1> : tensor<si64>}> : () -> !torch.vtensor<[],si64>
    %277 = "util.global.load"() <{global = @__auto.blk.3.attn_scale}> : () -> tensor<f32>
    %278 = "torch_c.from_builtin_tensor"(%277) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %279 = "util.global.load"() <{global = @"__auto.blk.3.attn_output.q_input:rscale"}> : () -> tensor<f32>
    %280 = "torch_c.from_builtin_tensor"(%279) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %281 = "util.global.load"() <{global = @"__auto.blk.3.attn_output.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
    %282 = "torch_c.from_builtin_tensor"(%281) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
    %283 = "util.global.load"() <{global = @__auto.blk.3.ffn_norm.weight}> : () -> tensor<4096xbf16>
    %284 = "torch_c.from_builtin_tensor"(%283) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
    %285 = "util.global.load"() <{global = @"__auto.blk.3.ffn_gate.q_input:rscale"}> : () -> tensor<f32>
    %286 = "torch_c.from_builtin_tensor"(%285) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %287 = "util.global.load"() <{global = @"__auto.blk.3.ffn_gate.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
    %288 = "torch_c.from_builtin_tensor"(%287) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
    %289 = "util.global.load"() <{global = @"__auto.blk.3.ffn_up.q_input:rscale"}> : () -> tensor<f32>
    %290 = "torch_c.from_builtin_tensor"(%289) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %291 = "util.global.load"() <{global = @"__auto.blk.3.ffn_up.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
    %292 = "torch_c.from_builtin_tensor"(%291) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
    %293 = "util.global.load"() <{global = @"__auto.blk.3.ffn_down.q_input:rscale"}> : () -> tensor<f32>
    %294 = "torch_c.from_builtin_tensor"(%293) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %295 = "util.global.load"() <{global = @"__auto.blk.3.ffn_down.weight:qs"}> : () -> tensor<4096x14336xf8E4M3FNUZ>
    %296 = "torch_c.from_builtin_tensor"(%295) : (tensor<4096x14336xf8E4M3FNUZ>) -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
    %297 = "util.global.load"() <{global = @__auto.blk.4.attn_norm.weight}> : () -> tensor<4096xbf16>
    %298 = "torch_c.from_builtin_tensor"(%297) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
    %299 = "util.global.load"() <{global = @"__auto.blk.4.attn_q.q_input:rscale"}> : () -> tensor<f32>
    %300 = "torch_c.from_builtin_tensor"(%299) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %301 = "util.global.load"() <{global = @"__auto.blk.4.attn_q.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
    %302 = "torch_c.from_builtin_tensor"(%301) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
    %303 = "util.global.load"() <{global = @"__auto.blk.4.attn_q.q_output:rscale"}> : () -> tensor<f32>
    %304 = "torch_c.from_builtin_tensor"(%303) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %305 = "util.global.load"() <{global = @"__auto.blk.4.attn_k.q_input:rscale"}> : () -> tensor<f32>
    %306 = "torch_c.from_builtin_tensor"(%305) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %307 = "util.global.load"() <{global = @"__auto.blk.4.attn_k.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
    %308 = "torch_c.from_builtin_tensor"(%307) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
    %309 = "util.global.load"() <{global = @"__auto.blk.4.attn_k.q_output:rscale"}> : () -> tensor<f32>
    %310 = "torch_c.from_builtin_tensor"(%309) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %311 = "util.global.load"() <{global = @"__auto.blk.4.attn_v.q_input:rscale"}> : () -> tensor<f32>
    %312 = "torch_c.from_builtin_tensor"(%311) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %313 = "util.global.load"() <{global = @"__auto.blk.4.attn_v.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
    %314 = "torch_c.from_builtin_tensor"(%313) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
    %315 = "util.global.load"() <{global = @"__auto.blk.4.attn_v.q_output:rscale"}> : () -> tensor<f32>
    %316 = "torch_c.from_builtin_tensor"(%315) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %317 = "torch.vtensor.literal"() <{value = dense<0> : tensor<si64>}> : () -> !torch.vtensor<[],si64>
    %318 = "torch.vtensor.literal"() <{value = dense<1> : tensor<si64>}> : () -> !torch.vtensor<[],si64>
    %319 = "util.global.load"() <{global = @__auto.blk.4.attn_scale}> : () -> tensor<f32>
    %320 = "torch_c.from_builtin_tensor"(%319) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %321 = "util.global.load"() <{global = @"__auto.blk.4.attn_output.q_input:rscale"}> : () -> tensor<f32>
    %322 = "torch_c.from_builtin_tensor"(%321) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %323 = "util.global.load"() <{global = @"__auto.blk.4.attn_output.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
    %324 = "torch_c.from_builtin_tensor"(%323) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
    %325 = "util.global.load"() <{global = @__auto.blk.4.ffn_norm.weight}> : () -> tensor<4096xbf16>
    %326 = "torch_c.from_builtin_tensor"(%325) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
    %327 = "util.global.load"() <{global = @"__auto.blk.4.ffn_gate.q_input:rscale"}> : () -> tensor<f32>
    %328 = "torch_c.from_builtin_tensor"(%327) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %329 = "util.global.load"() <{global = @"__auto.blk.4.ffn_gate.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
    %330 = "torch_c.from_builtin_tensor"(%329) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
    %331 = "util.global.load"() <{global = @"__auto.blk.4.ffn_up.q_input:rscale"}> : () -> tensor<f32>
    %332 = "torch_c.from_builtin_tensor"(%331) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %333 = "util.global.load"() <{global = @"__auto.blk.4.ffn_up.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
    %334 = "torch_c.from_builtin_tensor"(%333) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
    %335 = "util.global.load"() <{global = @"__auto.blk.4.ffn_down.q_input:rscale"}> : () -> tensor<f32>
    %336 = "torch_c.from_builtin_tensor"(%335) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %337 = "util.global.load"() <{global = @"__auto.blk.4.ffn_down.weight:qs"}> : () -> tensor<4096x14336xf8E4M3FNUZ>
    %338 = "torch_c.from_builtin_tensor"(%337) : (tensor<4096x14336xf8E4M3FNUZ>) -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
    %339 = "util.global.load"() <{global = @__auto.blk.5.attn_norm.weight}> : () -> tensor<4096xbf16>
    %340 = "torch_c.from_builtin_tensor"(%339) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
    %341 = "util.global.load"() <{global = @"__auto.blk.5.attn_q.q_input:rscale"}> : () -> tensor<f32>
    %342 = "torch_c.from_builtin_tensor"(%341) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %343 = "util.global.load"() <{global = @"__auto.blk.5.attn_q.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
    %344 = "torch_c.from_builtin_tensor"(%343) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
    %345 = "util.global.load"() <{global = @"__auto.blk.5.attn_q.q_output:rscale"}> : () -> tensor<f32>
    %346 = "torch_c.from_builtin_tensor"(%345) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %347 = "util.global.load"() <{global = @"__auto.blk.5.attn_k.q_input:rscale"}> : () -> tensor<f32>
    %348 = "torch_c.from_builtin_tensor"(%347) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %349 = "util.global.load"() <{global = @"__auto.blk.5.attn_k.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
    %350 = "torch_c.from_builtin_tensor"(%349) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
    %351 = "util.global.load"() <{global = @"__auto.blk.5.attn_k.q_output:rscale"}> : () -> tensor<f32>
    %352 = "torch_c.from_builtin_tensor"(%351) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %353 = "util.global.load"() <{global = @"__auto.blk.5.attn_v.q_input:rscale"}> : () -> tensor<f32>
    %354 = "torch_c.from_builtin_tensor"(%353) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %355 = "util.global.load"() <{global = @"__auto.blk.5.attn_v.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
    %356 = "torch_c.from_builtin_tensor"(%355) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
    %357 = "util.global.load"() <{global = @"__auto.blk.5.attn_v.q_output:rscale"}> : () -> tensor<f32>
    %358 = "torch_c.from_builtin_tensor"(%357) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %359 = "torch.vtensor.literal"() <{value = dense<0> : tensor<si64>}> : () -> !torch.vtensor<[],si64>
    %360 = "torch.vtensor.literal"() <{value = dense<1> : tensor<si64>}> : () -> !torch.vtensor<[],si64>
    %361 = "util.global.load"() <{global = @__auto.blk.5.attn_scale}> : () -> tensor<f32>
    %362 = "torch_c.from_builtin_tensor"(%361) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %363 = "util.global.load"() <{global = @"__auto.blk.5.attn_output.q_input:rscale"}> : () -> tensor<f32>
    %364 = "torch_c.from_builtin_tensor"(%363) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %365 = "util.global.load"() <{global = @"__auto.blk.5.attn_output.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
    %366 = "torch_c.from_builtin_tensor"(%365) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
    %367 = "util.global.load"() <{global = @__auto.blk.5.ffn_norm.weight}> : () -> tensor<4096xbf16>
    %368 = "torch_c.from_builtin_tensor"(%367) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
    %369 = "util.global.load"() <{global = @"__auto.blk.5.ffn_gate.q_input:rscale"}> : () -> tensor<f32>
    %370 = "torch_c.from_builtin_tensor"(%369) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %371 = "util.global.load"() <{global = @"__auto.blk.5.ffn_gate.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
    %372 = "torch_c.from_builtin_tensor"(%371) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
    %373 = "util.global.load"() <{global = @"__auto.blk.5.ffn_up.q_input:rscale"}> : () -> tensor<f32>
    %374 = "torch_c.from_builtin_tensor"(%373) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %375 = "util.global.load"() <{global = @"__auto.blk.5.ffn_up.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
    %376 = "torch_c.from_builtin_tensor"(%375) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
    %377 = "util.global.load"() <{global = @"__auto.blk.5.ffn_down.q_input:rscale"}> : () -> tensor<f32>
    %378 = "torch_c.from_builtin_tensor"(%377) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %379 = "util.global.load"() <{global = @"__auto.blk.5.ffn_down.weight:qs"}> : () -> tensor<4096x14336xf8E4M3FNUZ>
    %380 = "torch_c.from_builtin_tensor"(%379) : (tensor<4096x14336xf8E4M3FNUZ>) -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
    %381 = "util.global.load"() <{global = @__auto.blk.6.attn_norm.weight}> : () -> tensor<4096xbf16>
    %382 = "torch_c.from_builtin_tensor"(%381) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
    %383 = "util.global.load"() <{global = @"__auto.blk.6.attn_q.q_input:rscale"}> : () -> tensor<f32>
    %384 = "torch_c.from_builtin_tensor"(%383) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %385 = "util.global.load"() <{global = @"__auto.blk.6.attn_q.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
    %386 = "torch_c.from_builtin_tensor"(%385) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
    %387 = "util.global.load"() <{global = @"__auto.blk.6.attn_q.q_output:rscale"}> : () -> tensor<f32>
    %388 = "torch_c.from_builtin_tensor"(%387) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %389 = "util.global.load"() <{global = @"__auto.blk.6.attn_k.q_input:rscale"}> : () -> tensor<f32>
    %390 = "torch_c.from_builtin_tensor"(%389) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %391 = "util.global.load"() <{global = @"__auto.blk.6.attn_k.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
    %392 = "torch_c.from_builtin_tensor"(%391) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
    %393 = "util.global.load"() <{global = @"__auto.blk.6.attn_k.q_output:rscale"}> : () -> tensor<f32>
    %394 = "torch_c.from_builtin_tensor"(%393) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %395 = "util.global.load"() <{global = @"__auto.blk.6.attn_v.q_input:rscale"}> : () -> tensor<f32>
    %396 = "torch_c.from_builtin_tensor"(%395) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %397 = "util.global.load"() <{global = @"__auto.blk.6.attn_v.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
    %398 = "torch_c.from_builtin_tensor"(%397) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
    %399 = "util.global.load"() <{global = @"__auto.blk.6.attn_v.q_output:rscale"}> : () -> tensor<f32>
    %400 = "torch_c.from_builtin_tensor"(%399) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %401 = "torch.vtensor.literal"() <{value = dense<0> : tensor<si64>}> : () -> !torch.vtensor<[],si64>
    %402 = "torch.vtensor.literal"() <{value = dense<1> : tensor<si64>}> : () -> !torch.vtensor<[],si64>
    %403 = "util.global.load"() <{global = @__auto.blk.6.attn_scale}> : () -> tensor<f32>
    %404 = "torch_c.from_builtin_tensor"(%403) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %405 = "util.global.load"() <{global = @"__auto.blk.6.attn_output.q_input:rscale"}> : () -> tensor<f32>
    %406 = "torch_c.from_builtin_tensor"(%405) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %407 = "util.global.load"() <{global = @"__auto.blk.6.attn_output.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
    %408 = "torch_c.from_builtin_tensor"(%407) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
    %409 = "util.global.load"() <{global = @__auto.blk.6.ffn_norm.weight}> : () -> tensor<4096xbf16>
    %410 = "torch_c.from_builtin_tensor"(%409) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
    %411 = "util.global.load"() <{global = @"__auto.blk.6.ffn_gate.q_input:rscale"}> : () -> tensor<f32>
    %412 = "torch_c.from_builtin_tensor"(%411) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %413 = "util.global.load"() <{global = @"__auto.blk.6.ffn_gate.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
    %414 = "torch_c.from_builtin_tensor"(%413) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
    %415 = "util.global.load"() <{global = @"__auto.blk.6.ffn_up.q_input:rscale"}> : () -> tensor<f32>
    %416 = "torch_c.from_builtin_tensor"(%415) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %417 = "util.global.load"() <{global = @"__auto.blk.6.ffn_up.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
    %418 = "torch_c.from_builtin_tensor"(%417) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
    %419 = "util.global.load"() <{global = @"__auto.blk.6.ffn_down.q_input:rscale"}> : () -> tensor<f32>
    %420 = "torch_c.from_builtin_tensor"(%419) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %421 = "util.global.load"() <{global = @"__auto.blk.6.ffn_down.weight:qs"}> : () -> tensor<4096x14336xf8E4M3FNUZ>
    %422 = "torch_c.from_builtin_tensor"(%421) : (tensor<4096x14336xf8E4M3FNUZ>) -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
    %423 = "util.global.load"() <{global = @__auto.blk.7.attn_norm.weight}> : () -> tensor<4096xbf16>
    %424 = "torch_c.from_builtin_tensor"(%423) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
    %425 = "util.global.load"() <{global = @"__auto.blk.7.attn_q.q_input:rscale"}> : () -> tensor<f32>
    %426 = "torch_c.from_builtin_tensor"(%425) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %427 = "util.global.load"() <{global = @"__auto.blk.7.attn_q.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
    %428 = "torch_c.from_builtin_tensor"(%427) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
    %429 = "util.global.load"() <{global = @"__auto.blk.7.attn_q.q_output:rscale"}> : () -> tensor<f32>
    %430 = "torch_c.from_builtin_tensor"(%429) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %431 = "util.global.load"() <{global = @"__auto.blk.7.attn_k.q_input:rscale"}> : () -> tensor<f32>
    %432 = "torch_c.from_builtin_tensor"(%431) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %433 = "util.global.load"() <{global = @"__auto.blk.7.attn_k.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
    %434 = "torch_c.from_builtin_tensor"(%433) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
    %435 = "util.global.load"() <{global = @"__auto.blk.7.attn_k.q_output:rscale"}> : () -> tensor<f32>
    %436 = "torch_c.from_builtin_tensor"(%435) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %437 = "util.global.load"() <{global = @"__auto.blk.7.attn_v.q_input:rscale"}> : () -> tensor<f32>
    %438 = "torch_c.from_builtin_tensor"(%437) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %439 = "util.global.load"() <{global = @"__auto.blk.7.attn_v.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
    %440 = "torch_c.from_builtin_tensor"(%439) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
    %441 = "util.global.load"() <{global = @"__auto.blk.7.attn_v.q_output:rscale"}> : () -> tensor<f32>
    %442 = "torch_c.from_builtin_tensor"(%441) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %443 = "torch.vtensor.literal"() <{value = dense<0> : tensor<si64>}> : () -> !torch.vtensor<[],si64>
    %444 = "torch.vtensor.literal"() <{value = dense<1> : tensor<si64>}> : () -> !torch.vtensor<[],si64>
    %445 = "util.global.load"() <{global = @__auto.blk.7.attn_scale}> : () -> tensor<f32>
    %446 = "torch_c.from_builtin_tensor"(%445) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %447 = "util.global.load"() <{global = @"__auto.blk.7.attn_output.q_input:rscale"}> : () -> tensor<f32>
    %448 = "torch_c.from_builtin_tensor"(%447) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %449 = "util.global.load"() <{global = @"__auto.blk.7.attn_output.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
    %450 = "torch_c.from_builtin_tensor"(%449) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
    %451 = "util.global.load"() <{global = @__auto.blk.7.ffn_norm.weight}> : () -> tensor<4096xbf16>
    %452 = "torch_c.from_builtin_tensor"(%451) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
    %453 = "util.global.load"() <{global = @"__auto.blk.7.ffn_gate.q_input:rscale"}> : () -> tensor<f32>
    %454 = "torch_c.from_builtin_tensor"(%453) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %455 = "util.global.load"() <{global = @"__auto.blk.7.ffn_gate.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
    %456 = "torch_c.from_builtin_tensor"(%455) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
    %457 = "util.global.load"() <{global = @"__auto.blk.7.ffn_up.q_input:rscale"}> : () -> tensor<f32>
    %458 = "torch_c.from_builtin_tensor"(%457) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %459 = "util.global.load"() <{global = @"__auto.blk.7.ffn_up.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
    %460 = "torch_c.from_builtin_tensor"(%459) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
    %461 = "util.global.load"() <{global = @"__auto.blk.7.ffn_down.q_input:rscale"}> : () -> tensor<f32>
    %462 = "torch_c.from_builtin_tensor"(%461) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %463 = "util.global.load"() <{global = @"__auto.blk.7.ffn_down.weight:qs"}> : () -> tensor<4096x14336xf8E4M3FNUZ>
    %464 = "torch_c.from_builtin_tensor"(%463) : (tensor<4096x14336xf8E4M3FNUZ>) -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
    %465 = "util.global.load"() <{global = @__auto.blk.8.attn_norm.weight}> : () -> tensor<4096xbf16>
    %466 = "torch_c.from_builtin_tensor"(%465) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
    %467 = "util.global.load"() <{global = @"__auto.blk.8.attn_q.q_input:rscale"}> : () -> tensor<f32>
    %468 = "torch_c.from_builtin_tensor"(%467) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %469 = "util.global.load"() <{global = @"__auto.blk.8.attn_q.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
    %470 = "torch_c.from_builtin_tensor"(%469) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
    %471 = "util.global.load"() <{global = @"__auto.blk.8.attn_q.q_output:rscale"}> : () -> tensor<f32>
    %472 = "torch_c.from_builtin_tensor"(%471) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %473 = "util.global.load"() <{global = @"__auto.blk.8.attn_k.q_input:rscale"}> : () -> tensor<f32>
    %474 = "torch_c.from_builtin_tensor"(%473) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %475 = "util.global.load"() <{global = @"__auto.blk.8.attn_k.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
    %476 = "torch_c.from_builtin_tensor"(%475) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
    %477 = "util.global.load"() <{global = @"__auto.blk.8.attn_k.q_output:rscale"}> : () -> tensor<f32>
    %478 = "torch_c.from_builtin_tensor"(%477) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %479 = "util.global.load"() <{global = @"__auto.blk.8.attn_v.q_input:rscale"}> : () -> tensor<f32>
    %480 = "torch_c.from_builtin_tensor"(%479) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %481 = "util.global.load"() <{global = @"__auto.blk.8.attn_v.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
    %482 = "torch_c.from_builtin_tensor"(%481) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
    %483 = "util.global.load"() <{global = @"__auto.blk.8.attn_v.q_output:rscale"}> : () -> tensor<f32>
    %484 = "torch_c.from_builtin_tensor"(%483) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %485 = "torch.vtensor.literal"() <{value = dense<0> : tensor<si64>}> : () -> !torch.vtensor<[],si64>
    %486 = "torch.vtensor.literal"() <{value = dense<1> : tensor<si64>}> : () -> !torch.vtensor<[],si64>
    %487 = "util.global.load"() <{global = @__auto.blk.8.attn_scale}> : () -> tensor<f32>
    %488 = "torch_c.from_builtin_tensor"(%487) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %489 = "util.global.load"() <{global = @"__auto.blk.8.attn_output.q_input:rscale"}> : () -> tensor<f32>
    %490 = "torch_c.from_builtin_tensor"(%489) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %491 = "util.global.load"() <{global = @"__auto.blk.8.attn_output.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
    %492 = "torch_c.from_builtin_tensor"(%491) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
    %493 = "util.global.load"() <{global = @__auto.blk.8.ffn_norm.weight}> : () -> tensor<4096xbf16>
    %494 = "torch_c.from_builtin_tensor"(%493) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
    %495 = "util.global.load"() <{global = @"__auto.blk.8.ffn_gate.q_input:rscale"}> : () -> tensor<f32>
    %496 = "torch_c.from_builtin_tensor"(%495) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %497 = "util.global.load"() <{global = @"__auto.blk.8.ffn_gate.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
    %498 = "torch_c.from_builtin_tensor"(%497) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
    %499 = "util.global.load"() <{global = @"__auto.blk.8.ffn_up.q_input:rscale"}> : () -> tensor<f32>
    %500 = "torch_c.from_builtin_tensor"(%499) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %501 = "util.global.load"() <{global = @"__auto.blk.8.ffn_up.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
    %502 = "torch_c.from_builtin_tensor"(%501) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
    %503 = "util.global.load"() <{global = @"__auto.blk.8.ffn_down.q_input:rscale"}> : () -> tensor<f32>
    %504 = "torch_c.from_builtin_tensor"(%503) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %505 = "util.global.load"() <{global = @"__auto.blk.8.ffn_down.weight:qs"}> : () -> tensor<4096x14336xf8E4M3FNUZ>
    %506 = "torch_c.from_builtin_tensor"(%505) : (tensor<4096x14336xf8E4M3FNUZ>) -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
    %507 = "util.global.load"() <{global = @__auto.blk.9.attn_norm.weight}> : () -> tensor<4096xbf16>
    %508 = "torch_c.from_builtin_tensor"(%507) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
    %509 = "util.global.load"() <{global = @"__auto.blk.9.attn_q.q_input:rscale"}> : () -> tensor<f32>
    %510 = "torch_c.from_builtin_tensor"(%509) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %511 = "util.global.load"() <{global = @"__auto.blk.9.attn_q.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
    %512 = "torch_c.from_builtin_tensor"(%511) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
    %513 = "util.global.load"() <{global = @"__auto.blk.9.attn_q.q_output:rscale"}> : () -> tensor<f32>
    %514 = "torch_c.from_builtin_tensor"(%513) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %515 = "util.global.load"() <{global = @"__auto.blk.9.attn_k.q_input:rscale"}> : () -> tensor<f32>
    %516 = "torch_c.from_builtin_tensor"(%515) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %517 = "util.global.load"() <{global = @"__auto.blk.9.attn_k.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
    %518 = "torch_c.from_builtin_tensor"(%517) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
    %519 = "util.global.load"() <{global = @"__auto.blk.9.attn_k.q_output:rscale"}> : () -> tensor<f32>
    %520 = "torch_c.from_builtin_tensor"(%519) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %521 = "util.global.load"() <{global = @"__auto.blk.9.attn_v.q_input:rscale"}> : () -> tensor<f32>
    %522 = "torch_c.from_builtin_tensor"(%521) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %523 = "util.global.load"() <{global = @"__auto.blk.9.attn_v.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
    %524 = "torch_c.from_builtin_tensor"(%523) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
    %525 = "util.global.load"() <{global = @"__auto.blk.9.attn_v.q_output:rscale"}> : () -> tensor<f32>
    %526 = "torch_c.from_builtin_tensor"(%525) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %527 = "torch.vtensor.literal"() <{value = dense<0> : tensor<si64>}> : () -> !torch.vtensor<[],si64>
    %528 = "torch.vtensor.literal"() <{value = dense<1> : tensor<si64>}> : () -> !torch.vtensor<[],si64>
    %529 = "util.global.load"() <{global = @__auto.blk.9.attn_scale}> : () -> tensor<f32>
    %530 = "torch_c.from_builtin_tensor"(%529) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %531 = "util.global.load"() <{global = @"__auto.blk.9.attn_output.q_input:rscale"}> : () -> tensor<f32>
    %532 = "torch_c.from_builtin_tensor"(%531) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %533 = "util.global.load"() <{global = @"__auto.blk.9.attn_output.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
    %534 = "torch_c.from_builtin_tensor"(%533) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
    %535 = "util.global.load"() <{global = @__auto.blk.9.ffn_norm.weight}> : () -> tensor<4096xbf16>
    %536 = "torch_c.from_builtin_tensor"(%535) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
    %537 = "util.global.load"() <{global = @"__auto.blk.9.ffn_gate.q_input:rscale"}> : () -> tensor<f32>
    %538 = "torch_c.from_builtin_tensor"(%537) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %539 = "util.global.load"() <{global = @"__auto.blk.9.ffn_gate.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
    %540 = "torch_c.from_builtin_tensor"(%539) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
    %541 = "util.global.load"() <{global = @"__auto.blk.9.ffn_up.q_input:rscale"}> : () -> tensor<f32>
    %542 = "torch_c.from_builtin_tensor"(%541) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %543 = "util.global.load"() <{global = @"__auto.blk.9.ffn_up.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
    %544 = "torch_c.from_builtin_tensor"(%543) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
    %545 = "util.global.load"() <{global = @"__auto.blk.9.ffn_down.q_input:rscale"}> : () -> tensor<f32>
    %546 = "torch_c.from_builtin_tensor"(%545) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %547 = "util.global.load"() <{global = @"__auto.blk.9.ffn_down.weight:qs"}> : () -> tensor<4096x14336xf8E4M3FNUZ>
    %548 = "torch_c.from_builtin_tensor"(%547) : (tensor<4096x14336xf8E4M3FNUZ>) -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
    %549 = "util.global.load"() <{global = @__auto.blk.10.attn_norm.weight}> : () -> tensor<4096xbf16>
    %550 = "torch_c.from_builtin_tensor"(%549) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
    %551 = "util.global.load"() <{global = @"__auto.blk.10.attn_q.q_input:rscale"}> : () -> tensor<f32>
    %552 = "torch_c.from_builtin_tensor"(%551) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %553 = "util.global.load"() <{global = @"__auto.blk.10.attn_q.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
    %554 = "torch_c.from_builtin_tensor"(%553) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
    %555 = "util.global.load"() <{global = @"__auto.blk.10.attn_q.q_output:rscale"}> : () -> tensor<f32>
    %556 = "torch_c.from_builtin_tensor"(%555) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %557 = "util.global.load"() <{global = @"__auto.blk.10.attn_k.q_input:rscale"}> : () -> tensor<f32>
    %558 = "torch_c.from_builtin_tensor"(%557) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %559 = "util.global.load"() <{global = @"__auto.blk.10.attn_k.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
    %560 = "torch_c.from_builtin_tensor"(%559) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
    %561 = "util.global.load"() <{global = @"__auto.blk.10.attn_k.q_output:rscale"}> : () -> tensor<f32>
    %562 = "torch_c.from_builtin_tensor"(%561) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %563 = "util.global.load"() <{global = @"__auto.blk.10.attn_v.q_input:rscale"}> : () -> tensor<f32>
    %564 = "torch_c.from_builtin_tensor"(%563) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %565 = "util.global.load"() <{global = @"__auto.blk.10.attn_v.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
    %566 = "torch_c.from_builtin_tensor"(%565) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
    %567 = "util.global.load"() <{global = @"__auto.blk.10.attn_v.q_output:rscale"}> : () -> tensor<f32>
    %568 = "torch_c.from_builtin_tensor"(%567) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %569 = "torch.vtensor.literal"() <{value = dense<0> : tensor<si64>}> : () -> !torch.vtensor<[],si64>
    %570 = "torch.vtensor.literal"() <{value = dense<1> : tensor<si64>}> : () -> !torch.vtensor<[],si64>
    %571 = "util.global.load"() <{global = @__auto.blk.10.attn_scale}> : () -> tensor<f32>
    %572 = "torch_c.from_builtin_tensor"(%571) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %573 = "util.global.load"() <{global = @"__auto.blk.10.attn_output.q_input:rscale"}> : () -> tensor<f32>
    %574 = "torch_c.from_builtin_tensor"(%573) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %575 = "util.global.load"() <{global = @"__auto.blk.10.attn_output.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
    %576 = "torch_c.from_builtin_tensor"(%575) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
    %577 = "util.global.load"() <{global = @__auto.blk.10.ffn_norm.weight}> : () -> tensor<4096xbf16>
    %578 = "torch_c.from_builtin_tensor"(%577) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
    %579 = "util.global.load"() <{global = @"__auto.blk.10.ffn_gate.q_input:rscale"}> : () -> tensor<f32>
    %580 = "torch_c.from_builtin_tensor"(%579) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %581 = "util.global.load"() <{global = @"__auto.blk.10.ffn_gate.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
    %582 = "torch_c.from_builtin_tensor"(%581) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
    %583 = "util.global.load"() <{global = @"__auto.blk.10.ffn_up.q_input:rscale"}> : () -> tensor<f32>
    %584 = "torch_c.from_builtin_tensor"(%583) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %585 = "util.global.load"() <{global = @"__auto.blk.10.ffn_up.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
    %586 = "torch_c.from_builtin_tensor"(%585) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
    %587 = "util.global.load"() <{global = @"__auto.blk.10.ffn_down.q_input:rscale"}> : () -> tensor<f32>
    %588 = "torch_c.from_builtin_tensor"(%587) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %589 = "util.global.load"() <{global = @"__auto.blk.10.ffn_down.weight:qs"}> : () -> tensor<4096x14336xf8E4M3FNUZ>
    %590 = "torch_c.from_builtin_tensor"(%589) : (tensor<4096x14336xf8E4M3FNUZ>) -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
    %591 = "util.global.load"() <{global = @__auto.blk.11.attn_norm.weight}> : () -> tensor<4096xbf16>
    %592 = "torch_c.from_builtin_tensor"(%591) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
    %593 = "util.global.load"() <{global = @"__auto.blk.11.attn_q.q_input:rscale"}> : () -> tensor<f32>
    %594 = "torch_c.from_builtin_tensor"(%593) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %595 = "util.global.load"() <{global = @"__auto.blk.11.attn_q.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
    %596 = "torch_c.from_builtin_tensor"(%595) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
    %597 = "util.global.load"() <{global = @"__auto.blk.11.attn_q.q_output:rscale"}> : () -> tensor<f32>
    %598 = "torch_c.from_builtin_tensor"(%597) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %599 = "util.global.load"() <{global = @"__auto.blk.11.attn_k.q_input:rscale"}> : () -> tensor<f32>
    %600 = "torch_c.from_builtin_tensor"(%599) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %601 = "util.global.load"() <{global = @"__auto.blk.11.attn_k.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
    %602 = "torch_c.from_builtin_tensor"(%601) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
    %603 = "util.global.load"() <{global = @"__auto.blk.11.attn_k.q_output:rscale"}> : () -> tensor<f32>
    %604 = "torch_c.from_builtin_tensor"(%603) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %605 = "util.global.load"() <{global = @"__auto.blk.11.attn_v.q_input:rscale"}> : () -> tensor<f32>
    %606 = "torch_c.from_builtin_tensor"(%605) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %607 = "util.global.load"() <{global = @"__auto.blk.11.attn_v.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
    %608 = "torch_c.from_builtin_tensor"(%607) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
    %609 = "util.global.load"() <{global = @"__auto.blk.11.attn_v.q_output:rscale"}> : () -> tensor<f32>
    %610 = "torch_c.from_builtin_tensor"(%609) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %611 = "torch.vtensor.literal"() <{value = dense<0> : tensor<si64>}> : () -> !torch.vtensor<[],si64>
    %612 = "torch.vtensor.literal"() <{value = dense<1> : tensor<si64>}> : () -> !torch.vtensor<[],si64>
    %613 = "util.global.load"() <{global = @__auto.blk.11.attn_scale}> : () -> tensor<f32>
    %614 = "torch_c.from_builtin_tensor"(%613) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %615 = "util.global.load"() <{global = @"__auto.blk.11.attn_output.q_input:rscale"}> : () -> tensor<f32>
    %616 = "torch_c.from_builtin_tensor"(%615) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %617 = "util.global.load"() <{global = @"__auto.blk.11.attn_output.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
    %618 = "torch_c.from_builtin_tensor"(%617) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
    %619 = "util.global.load"() <{global = @__auto.blk.11.ffn_norm.weight}> : () -> tensor<4096xbf16>
    %620 = "torch_c.from_builtin_tensor"(%619) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
    %621 = "util.global.load"() <{global = @"__auto.blk.11.ffn_gate.q_input:rscale"}> : () -> tensor<f32>
    %622 = "torch_c.from_builtin_tensor"(%621) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %623 = "util.global.load"() <{global = @"__auto.blk.11.ffn_gate.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
    %624 = "torch_c.from_builtin_tensor"(%623) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
    %625 = "util.global.load"() <{global = @"__auto.blk.11.ffn_up.q_input:rscale"}> : () -> tensor<f32>
    %626 = "torch_c.from_builtin_tensor"(%625) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %627 = "util.global.load"() <{global = @"__auto.blk.11.ffn_up.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
    %628 = "torch_c.from_builtin_tensor"(%627) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
    %629 = "util.global.load"() <{global = @"__auto.blk.11.ffn_down.q_input:rscale"}> : () -> tensor<f32>
    %630 = "torch_c.from_builtin_tensor"(%629) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %631 = "util.global.load"() <{global = @"__auto.blk.11.ffn_down.weight:qs"}> : () -> tensor<4096x14336xf8E4M3FNUZ>
    %632 = "torch_c.from_builtin_tensor"(%631) : (tensor<4096x14336xf8E4M3FNUZ>) -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
    %633 = "util.global.load"() <{global = @__auto.blk.12.attn_norm.weight}> : () -> tensor<4096xbf16>
    %634 = "torch_c.from_builtin_tensor"(%633) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
    %635 = "util.global.load"() <{global = @"__auto.blk.12.attn_q.q_input:rscale"}> : () -> tensor<f32>
    %636 = "torch_c.from_builtin_tensor"(%635) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %637 = "util.global.load"() <{global = @"__auto.blk.12.attn_q.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
    %638 = "torch_c.from_builtin_tensor"(%637) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
    %639 = "util.global.load"() <{global = @"__auto.blk.12.attn_q.q_output:rscale"}> : () -> tensor<f32>
    %640 = "torch_c.from_builtin_tensor"(%639) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %641 = "util.global.load"() <{global = @"__auto.blk.12.attn_k.q_input:rscale"}> : () -> tensor<f32>
    %642 = "torch_c.from_builtin_tensor"(%641) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %643 = "util.global.load"() <{global = @"__auto.blk.12.attn_k.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
    %644 = "torch_c.from_builtin_tensor"(%643) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
    %645 = "util.global.load"() <{global = @"__auto.blk.12.attn_k.q_output:rscale"}> : () -> tensor<f32>
    %646 = "torch_c.from_builtin_tensor"(%645) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %647 = "util.global.load"() <{global = @"__auto.blk.12.attn_v.q_input:rscale"}> : () -> tensor<f32>
    %648 = "torch_c.from_builtin_tensor"(%647) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %649 = "util.global.load"() <{global = @"__auto.blk.12.attn_v.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
    %650 = "torch_c.from_builtin_tensor"(%649) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
    %651 = "util.global.load"() <{global = @"__auto.blk.12.attn_v.q_output:rscale"}> : () -> tensor<f32>
    %652 = "torch_c.from_builtin_tensor"(%651) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %653 = "torch.vtensor.literal"() <{value = dense<0> : tensor<si64>}> : () -> !torch.vtensor<[],si64>
    %654 = "torch.vtensor.literal"() <{value = dense<1> : tensor<si64>}> : () -> !torch.vtensor<[],si64>
    %655 = "util.global.load"() <{global = @__auto.blk.12.attn_scale}> : () -> tensor<f32>
    %656 = "torch_c.from_builtin_tensor"(%655) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %657 = "util.global.load"() <{global = @"__auto.blk.12.attn_output.q_input:rscale"}> : () -> tensor<f32>
    %658 = "torch_c.from_builtin_tensor"(%657) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %659 = "util.global.load"() <{global = @"__auto.blk.12.attn_output.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
    %660 = "torch_c.from_builtin_tensor"(%659) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
    %661 = "util.global.load"() <{global = @__auto.blk.12.ffn_norm.weight}> : () -> tensor<4096xbf16>
    %662 = "torch_c.from_builtin_tensor"(%661) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
    %663 = "util.global.load"() <{global = @"__auto.blk.12.ffn_gate.q_input:rscale"}> : () -> tensor<f32>
    %664 = "torch_c.from_builtin_tensor"(%663) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %665 = "util.global.load"() <{global = @"__auto.blk.12.ffn_gate.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
    %666 = "torch_c.from_builtin_tensor"(%665) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
    %667 = "util.global.load"() <{global = @"__auto.blk.12.ffn_up.q_input:rscale"}> : () -> tensor<f32>
    %668 = "torch_c.from_builtin_tensor"(%667) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %669 = "util.global.load"() <{global = @"__auto.blk.12.ffn_up.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
    %670 = "torch_c.from_builtin_tensor"(%669) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
    %671 = "util.global.load"() <{global = @"__auto.blk.12.ffn_down.q_input:rscale"}> : () -> tensor<f32>
    %672 = "torch_c.from_builtin_tensor"(%671) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %673 = "util.global.load"() <{global = @"__auto.blk.12.ffn_down.weight:qs"}> : () -> tensor<4096x14336xf8E4M3FNUZ>
    %674 = "torch_c.from_builtin_tensor"(%673) : (tensor<4096x14336xf8E4M3FNUZ>) -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
    %675 = "util.global.load"() <{global = @__auto.blk.13.attn_norm.weight}> : () -> tensor<4096xbf16>
    %676 = "torch_c.from_builtin_tensor"(%675) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
    %677 = "util.global.load"() <{global = @"__auto.blk.13.attn_q.q_input:rscale"}> : () -> tensor<f32>
    %678 = "torch_c.from_builtin_tensor"(%677) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %679 = "util.global.load"() <{global = @"__auto.blk.13.attn_q.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
    %680 = "torch_c.from_builtin_tensor"(%679) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
    %681 = "util.global.load"() <{global = @"__auto.blk.13.attn_q.q_output:rscale"}> : () -> tensor<f32>
    %682 = "torch_c.from_builtin_tensor"(%681) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %683 = "util.global.load"() <{global = @"__auto.blk.13.attn_k.q_input:rscale"}> : () -> tensor<f32>
    %684 = "torch_c.from_builtin_tensor"(%683) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %685 = "util.global.load"() <{global = @"__auto.blk.13.attn_k.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
    %686 = "torch_c.from_builtin_tensor"(%685) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
    %687 = "util.global.load"() <{global = @"__auto.blk.13.attn_k.q_output:rscale"}> : () -> tensor<f32>
    %688 = "torch_c.from_builtin_tensor"(%687) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %689 = "util.global.load"() <{global = @"__auto.blk.13.attn_v.q_input:rscale"}> : () -> tensor<f32>
    %690 = "torch_c.from_builtin_tensor"(%689) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %691 = "util.global.load"() <{global = @"__auto.blk.13.attn_v.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
    %692 = "torch_c.from_builtin_tensor"(%691) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
    %693 = "util.global.load"() <{global = @"__auto.blk.13.attn_v.q_output:rscale"}> : () -> tensor<f32>
    %694 = "torch_c.from_builtin_tensor"(%693) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %695 = "torch.vtensor.literal"() <{value = dense<0> : tensor<si64>}> : () -> !torch.vtensor<[],si64>
    %696 = "torch.vtensor.literal"() <{value = dense<1> : tensor<si64>}> : () -> !torch.vtensor<[],si64>
    %697 = "util.global.load"() <{global = @__auto.blk.13.attn_scale}> : () -> tensor<f32>
    %698 = "torch_c.from_builtin_tensor"(%697) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %699 = "util.global.load"() <{global = @"__auto.blk.13.attn_output.q_input:rscale"}> : () -> tensor<f32>
    %700 = "torch_c.from_builtin_tensor"(%699) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %701 = "util.global.load"() <{global = @"__auto.blk.13.attn_output.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
    %702 = "torch_c.from_builtin_tensor"(%701) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
    %703 = "util.global.load"() <{global = @__auto.blk.13.ffn_norm.weight}> : () -> tensor<4096xbf16>
    %704 = "torch_c.from_builtin_tensor"(%703) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
    %705 = "util.global.load"() <{global = @"__auto.blk.13.ffn_gate.q_input:rscale"}> : () -> tensor<f32>
    %706 = "torch_c.from_builtin_tensor"(%705) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %707 = "util.global.load"() <{global = @"__auto.blk.13.ffn_gate.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
    %708 = "torch_c.from_builtin_tensor"(%707) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
    %709 = "util.global.load"() <{global = @"__auto.blk.13.ffn_up.q_input:rscale"}> : () -> tensor<f32>
    %710 = "torch_c.from_builtin_tensor"(%709) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %711 = "util.global.load"() <{global = @"__auto.blk.13.ffn_up.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
    %712 = "torch_c.from_builtin_tensor"(%711) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
    %713 = "util.global.load"() <{global = @"__auto.blk.13.ffn_down.q_input:rscale"}> : () -> tensor<f32>
    %714 = "torch_c.from_builtin_tensor"(%713) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %715 = "util.global.load"() <{global = @"__auto.blk.13.ffn_down.weight:qs"}> : () -> tensor<4096x14336xf8E4M3FNUZ>
    %716 = "torch_c.from_builtin_tensor"(%715) : (tensor<4096x14336xf8E4M3FNUZ>) -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
    %717 = "util.global.load"() <{global = @__auto.blk.14.attn_norm.weight}> : () -> tensor<4096xbf16>
    %718 = "torch_c.from_builtin_tensor"(%717) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
    %719 = "util.global.load"() <{global = @"__auto.blk.14.attn_q.q_input:rscale"}> : () -> tensor<f32>
    %720 = "torch_c.from_builtin_tensor"(%719) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %721 = "util.global.load"() <{global = @"__auto.blk.14.attn_q.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
    %722 = "torch_c.from_builtin_tensor"(%721) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
    %723 = "util.global.load"() <{global = @"__auto.blk.14.attn_q.q_output:rscale"}> : () -> tensor<f32>
    %724 = "torch_c.from_builtin_tensor"(%723) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %725 = "util.global.load"() <{global = @"__auto.blk.14.attn_k.q_input:rscale"}> : () -> tensor<f32>
    %726 = "torch_c.from_builtin_tensor"(%725) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %727 = "util.global.load"() <{global = @"__auto.blk.14.attn_k.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
    %728 = "torch_c.from_builtin_tensor"(%727) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
    %729 = "util.global.load"() <{global = @"__auto.blk.14.attn_k.q_output:rscale"}> : () -> tensor<f32>
    %730 = "torch_c.from_builtin_tensor"(%729) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %731 = "util.global.load"() <{global = @"__auto.blk.14.attn_v.q_input:rscale"}> : () -> tensor<f32>
    %732 = "torch_c.from_builtin_tensor"(%731) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %733 = "util.global.load"() <{global = @"__auto.blk.14.attn_v.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
    %734 = "torch_c.from_builtin_tensor"(%733) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
    %735 = "util.global.load"() <{global = @"__auto.blk.14.attn_v.q_output:rscale"}> : () -> tensor<f32>
    %736 = "torch_c.from_builtin_tensor"(%735) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %737 = "torch.vtensor.literal"() <{value = dense<0> : tensor<si64>}> : () -> !torch.vtensor<[],si64>
    %738 = "torch.vtensor.literal"() <{value = dense<1> : tensor<si64>}> : () -> !torch.vtensor<[],si64>
    %739 = "util.global.load"() <{global = @__auto.blk.14.attn_scale}> : () -> tensor<f32>
    %740 = "torch_c.from_builtin_tensor"(%739) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %741 = "util.global.load"() <{global = @"__auto.blk.14.attn_output.q_input:rscale"}> : () -> tensor<f32>
    %742 = "torch_c.from_builtin_tensor"(%741) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %743 = "util.global.load"() <{global = @"__auto.blk.14.attn_output.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
    %744 = "torch_c.from_builtin_tensor"(%743) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
    %745 = "util.global.load"() <{global = @__auto.blk.14.ffn_norm.weight}> : () -> tensor<4096xbf16>
    %746 = "torch_c.from_builtin_tensor"(%745) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
    %747 = "util.global.load"() <{global = @"__auto.blk.14.ffn_gate.q_input:rscale"}> : () -> tensor<f32>
    %748 = "torch_c.from_builtin_tensor"(%747) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %749 = "util.global.load"() <{global = @"__auto.blk.14.ffn_gate.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
    %750 = "torch_c.from_builtin_tensor"(%749) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
    %751 = "util.global.load"() <{global = @"__auto.blk.14.ffn_up.q_input:rscale"}> : () -> tensor<f32>
    %752 = "torch_c.from_builtin_tensor"(%751) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %753 = "util.global.load"() <{global = @"__auto.blk.14.ffn_up.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
    %754 = "torch_c.from_builtin_tensor"(%753) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
    %755 = "util.global.load"() <{global = @"__auto.blk.14.ffn_down.q_input:rscale"}> : () -> tensor<f32>
    %756 = "torch_c.from_builtin_tensor"(%755) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %757 = "util.global.load"() <{global = @"__auto.blk.14.ffn_down.weight:qs"}> : () -> tensor<4096x14336xf8E4M3FNUZ>
    %758 = "torch_c.from_builtin_tensor"(%757) : (tensor<4096x14336xf8E4M3FNUZ>) -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
    %759 = "util.global.load"() <{global = @__auto.blk.15.attn_norm.weight}> : () -> tensor<4096xbf16>
    %760 = "torch_c.from_builtin_tensor"(%759) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
    %761 = "util.global.load"() <{global = @"__auto.blk.15.attn_q.q_input:rscale"}> : () -> tensor<f32>
    %762 = "torch_c.from_builtin_tensor"(%761) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %763 = "util.global.load"() <{global = @"__auto.blk.15.attn_q.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
    %764 = "torch_c.from_builtin_tensor"(%763) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
    %765 = "util.global.load"() <{global = @"__auto.blk.15.attn_q.q_output:rscale"}> : () -> tensor<f32>
    %766 = "torch_c.from_builtin_tensor"(%765) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %767 = "util.global.load"() <{global = @"__auto.blk.15.attn_k.q_input:rscale"}> : () -> tensor<f32>
    %768 = "torch_c.from_builtin_tensor"(%767) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %769 = "util.global.load"() <{global = @"__auto.blk.15.attn_k.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
    %770 = "torch_c.from_builtin_tensor"(%769) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
    %771 = "util.global.load"() <{global = @"__auto.blk.15.attn_k.q_output:rscale"}> : () -> tensor<f32>
    %772 = "torch_c.from_builtin_tensor"(%771) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %773 = "util.global.load"() <{global = @"__auto.blk.15.attn_v.q_input:rscale"}> : () -> tensor<f32>
    %774 = "torch_c.from_builtin_tensor"(%773) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %775 = "util.global.load"() <{global = @"__auto.blk.15.attn_v.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
    %776 = "torch_c.from_builtin_tensor"(%775) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
    %777 = "util.global.load"() <{global = @"__auto.blk.15.attn_v.q_output:rscale"}> : () -> tensor<f32>
    %778 = "torch_c.from_builtin_tensor"(%777) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %779 = "torch.vtensor.literal"() <{value = dense<0> : tensor<si64>}> : () -> !torch.vtensor<[],si64>
    %780 = "torch.vtensor.literal"() <{value = dense<1> : tensor<si64>}> : () -> !torch.vtensor<[],si64>
    %781 = "util.global.load"() <{global = @__auto.blk.15.attn_scale}> : () -> tensor<f32>
    %782 = "torch_c.from_builtin_tensor"(%781) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %783 = "util.global.load"() <{global = @"__auto.blk.15.attn_output.q_input:rscale"}> : () -> tensor<f32>
    %784 = "torch_c.from_builtin_tensor"(%783) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %785 = "util.global.load"() <{global = @"__auto.blk.15.attn_output.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
    %786 = "torch_c.from_builtin_tensor"(%785) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
    %787 = "util.global.load"() <{global = @__auto.blk.15.ffn_norm.weight}> : () -> tensor<4096xbf16>
    %788 = "torch_c.from_builtin_tensor"(%787) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
    %789 = "util.global.load"() <{global = @"__auto.blk.15.ffn_gate.q_input:rscale"}> : () -> tensor<f32>
    %790 = "torch_c.from_builtin_tensor"(%789) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %791 = "util.global.load"() <{global = @"__auto.blk.15.ffn_gate.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
    %792 = "torch_c.from_builtin_tensor"(%791) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
    %793 = "util.global.load"() <{global = @"__auto.blk.15.ffn_up.q_input:rscale"}> : () -> tensor<f32>
    %794 = "torch_c.from_builtin_tensor"(%793) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %795 = "util.global.load"() <{global = @"__auto.blk.15.ffn_up.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
    %796 = "torch_c.from_builtin_tensor"(%795) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
    %797 = "util.global.load"() <{global = @"__auto.blk.15.ffn_down.q_input:rscale"}> : () -> tensor<f32>
    %798 = "torch_c.from_builtin_tensor"(%797) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %799 = "util.global.load"() <{global = @"__auto.blk.15.ffn_down.weight:qs"}> : () -> tensor<4096x14336xf8E4M3FNUZ>
    %800 = "torch_c.from_builtin_tensor"(%799) : (tensor<4096x14336xf8E4M3FNUZ>) -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
    %801 = "util.global.load"() <{global = @__auto.blk.16.attn_norm.weight}> : () -> tensor<4096xbf16>
    %802 = "torch_c.from_builtin_tensor"(%801) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
    %803 = "util.global.load"() <{global = @"__auto.blk.16.attn_q.q_input:rscale"}> : () -> tensor<f32>
    %804 = "torch_c.from_builtin_tensor"(%803) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %805 = "util.global.load"() <{global = @"__auto.blk.16.attn_q.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
    %806 = "torch_c.from_builtin_tensor"(%805) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
    %807 = "util.global.load"() <{global = @"__auto.blk.16.attn_q.q_output:rscale"}> : () -> tensor<f32>
    %808 = "torch_c.from_builtin_tensor"(%807) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %809 = "util.global.load"() <{global = @"__auto.blk.16.attn_k.q_input:rscale"}> : () -> tensor<f32>
    %810 = "torch_c.from_builtin_tensor"(%809) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %811 = "util.global.load"() <{global = @"__auto.blk.16.attn_k.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
    %812 = "torch_c.from_builtin_tensor"(%811) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
    %813 = "util.global.load"() <{global = @"__auto.blk.16.attn_k.q_output:rscale"}> : () -> tensor<f32>
    %814 = "torch_c.from_builtin_tensor"(%813) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %815 = "util.global.load"() <{global = @"__auto.blk.16.attn_v.q_input:rscale"}> : () -> tensor<f32>
    %816 = "torch_c.from_builtin_tensor"(%815) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %817 = "util.global.load"() <{global = @"__auto.blk.16.attn_v.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
    %818 = "torch_c.from_builtin_tensor"(%817) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
    %819 = "util.global.load"() <{global = @"__auto.blk.16.attn_v.q_output:rscale"}> : () -> tensor<f32>
    %820 = "torch_c.from_builtin_tensor"(%819) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %821 = "torch.vtensor.literal"() <{value = dense<0> : tensor<si64>}> : () -> !torch.vtensor<[],si64>
    %822 = "torch.vtensor.literal"() <{value = dense<1> : tensor<si64>}> : () -> !torch.vtensor<[],si64>
    %823 = "util.global.load"() <{global = @__auto.blk.16.attn_scale}> : () -> tensor<f32>
    %824 = "torch_c.from_builtin_tensor"(%823) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %825 = "util.global.load"() <{global = @"__auto.blk.16.attn_output.q_input:rscale"}> : () -> tensor<f32>
    %826 = "torch_c.from_builtin_tensor"(%825) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %827 = "util.global.load"() <{global = @"__auto.blk.16.attn_output.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
    %828 = "torch_c.from_builtin_tensor"(%827) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
    %829 = "util.global.load"() <{global = @__auto.blk.16.ffn_norm.weight}> : () -> tensor<4096xbf16>
    %830 = "torch_c.from_builtin_tensor"(%829) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
    %831 = "util.global.load"() <{global = @"__auto.blk.16.ffn_gate.q_input:rscale"}> : () -> tensor<f32>
    %832 = "torch_c.from_builtin_tensor"(%831) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %833 = "util.global.load"() <{global = @"__auto.blk.16.ffn_gate.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
    %834 = "torch_c.from_builtin_tensor"(%833) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
    %835 = "util.global.load"() <{global = @"__auto.blk.16.ffn_up.q_input:rscale"}> : () -> tensor<f32>
    %836 = "torch_c.from_builtin_tensor"(%835) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %837 = "util.global.load"() <{global = @"__auto.blk.16.ffn_up.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
    %838 = "torch_c.from_builtin_tensor"(%837) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
    %839 = "util.global.load"() <{global = @"__auto.blk.16.ffn_down.q_input:rscale"}> : () -> tensor<f32>
    %840 = "torch_c.from_builtin_tensor"(%839) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %841 = "util.global.load"() <{global = @"__auto.blk.16.ffn_down.weight:qs"}> : () -> tensor<4096x14336xf8E4M3FNUZ>
    %842 = "torch_c.from_builtin_tensor"(%841) : (tensor<4096x14336xf8E4M3FNUZ>) -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
    %843 = "util.global.load"() <{global = @__auto.blk.17.attn_norm.weight}> : () -> tensor<4096xbf16>
    %844 = "torch_c.from_builtin_tensor"(%843) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
    %845 = "util.global.load"() <{global = @"__auto.blk.17.attn_q.q_input:rscale"}> : () -> tensor<f32>
    %846 = "torch_c.from_builtin_tensor"(%845) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %847 = "util.global.load"() <{global = @"__auto.blk.17.attn_q.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
    %848 = "torch_c.from_builtin_tensor"(%847) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
    %849 = "util.global.load"() <{global = @"__auto.blk.17.attn_q.q_output:rscale"}> : () -> tensor<f32>
    %850 = "torch_c.from_builtin_tensor"(%849) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %851 = "util.global.load"() <{global = @"__auto.blk.17.attn_k.q_input:rscale"}> : () -> tensor<f32>
    %852 = "torch_c.from_builtin_tensor"(%851) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %853 = "util.global.load"() <{global = @"__auto.blk.17.attn_k.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
    %854 = "torch_c.from_builtin_tensor"(%853) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
    %855 = "util.global.load"() <{global = @"__auto.blk.17.attn_k.q_output:rscale"}> : () -> tensor<f32>
    %856 = "torch_c.from_builtin_tensor"(%855) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %857 = "util.global.load"() <{global = @"__auto.blk.17.attn_v.q_input:rscale"}> : () -> tensor<f32>
    %858 = "torch_c.from_builtin_tensor"(%857) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %859 = "util.global.load"() <{global = @"__auto.blk.17.attn_v.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
    %860 = "torch_c.from_builtin_tensor"(%859) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
    %861 = "util.global.load"() <{global = @"__auto.blk.17.attn_v.q_output:rscale"}> : () -> tensor<f32>
    %862 = "torch_c.from_builtin_tensor"(%861) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %863 = "torch.vtensor.literal"() <{value = dense<0> : tensor<si64>}> : () -> !torch.vtensor<[],si64>
    %864 = "torch.vtensor.literal"() <{value = dense<1> : tensor<si64>}> : () -> !torch.vtensor<[],si64>
    %865 = "util.global.load"() <{global = @__auto.blk.17.attn_scale}> : () -> tensor<f32>
    %866 = "torch_c.from_builtin_tensor"(%865) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %867 = "util.global.load"() <{global = @"__auto.blk.17.attn_output.q_input:rscale"}> : () -> tensor<f32>
    %868 = "torch_c.from_builtin_tensor"(%867) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %869 = "util.global.load"() <{global = @"__auto.blk.17.attn_output.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
    %870 = "torch_c.from_builtin_tensor"(%869) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
    %871 = "util.global.load"() <{global = @__auto.blk.17.ffn_norm.weight}> : () -> tensor<4096xbf16>
    %872 = "torch_c.from_builtin_tensor"(%871) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
    %873 = "util.global.load"() <{global = @"__auto.blk.17.ffn_gate.q_input:rscale"}> : () -> tensor<f32>
    %874 = "torch_c.from_builtin_tensor"(%873) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %875 = "util.global.load"() <{global = @"__auto.blk.17.ffn_gate.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
    %876 = "torch_c.from_builtin_tensor"(%875) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
    %877 = "util.global.load"() <{global = @"__auto.blk.17.ffn_up.q_input:rscale"}> : () -> tensor<f32>
    %878 = "torch_c.from_builtin_tensor"(%877) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %879 = "util.global.load"() <{global = @"__auto.blk.17.ffn_up.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
    %880 = "torch_c.from_builtin_tensor"(%879) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
    %881 = "util.global.load"() <{global = @"__auto.blk.17.ffn_down.q_input:rscale"}> : () -> tensor<f32>
    %882 = "torch_c.from_builtin_tensor"(%881) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %883 = "util.global.load"() <{global = @"__auto.blk.17.ffn_down.weight:qs"}> : () -> tensor<4096x14336xf8E4M3FNUZ>
    %884 = "torch_c.from_builtin_tensor"(%883) : (tensor<4096x14336xf8E4M3FNUZ>) -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
    %885 = "util.global.load"() <{global = @__auto.blk.18.attn_norm.weight}> : () -> tensor<4096xbf16>
    %886 = "torch_c.from_builtin_tensor"(%885) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
    %887 = "util.global.load"() <{global = @"__auto.blk.18.attn_q.q_input:rscale"}> : () -> tensor<f32>
    %888 = "torch_c.from_builtin_tensor"(%887) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %889 = "util.global.load"() <{global = @"__auto.blk.18.attn_q.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
    %890 = "torch_c.from_builtin_tensor"(%889) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
    %891 = "util.global.load"() <{global = @"__auto.blk.18.attn_q.q_output:rscale"}> : () -> tensor<f32>
    %892 = "torch_c.from_builtin_tensor"(%891) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %893 = "util.global.load"() <{global = @"__auto.blk.18.attn_k.q_input:rscale"}> : () -> tensor<f32>
    %894 = "torch_c.from_builtin_tensor"(%893) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %895 = "util.global.load"() <{global = @"__auto.blk.18.attn_k.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
    %896 = "torch_c.from_builtin_tensor"(%895) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
    %897 = "util.global.load"() <{global = @"__auto.blk.18.attn_k.q_output:rscale"}> : () -> tensor<f32>
    %898 = "torch_c.from_builtin_tensor"(%897) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %899 = "util.global.load"() <{global = @"__auto.blk.18.attn_v.q_input:rscale"}> : () -> tensor<f32>
    %900 = "torch_c.from_builtin_tensor"(%899) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %901 = "util.global.load"() <{global = @"__auto.blk.18.attn_v.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
    %902 = "torch_c.from_builtin_tensor"(%901) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
    %903 = "util.global.load"() <{global = @"__auto.blk.18.attn_v.q_output:rscale"}> : () -> tensor<f32>
    %904 = "torch_c.from_builtin_tensor"(%903) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %905 = "torch.vtensor.literal"() <{value = dense<0> : tensor<si64>}> : () -> !torch.vtensor<[],si64>
    %906 = "torch.vtensor.literal"() <{value = dense<1> : tensor<si64>}> : () -> !torch.vtensor<[],si64>
    %907 = "util.global.load"() <{global = @__auto.blk.18.attn_scale}> : () -> tensor<f32>
    %908 = "torch_c.from_builtin_tensor"(%907) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %909 = "util.global.load"() <{global = @"__auto.blk.18.attn_output.q_input:rscale"}> : () -> tensor<f32>
    %910 = "torch_c.from_builtin_tensor"(%909) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %911 = "util.global.load"() <{global = @"__auto.blk.18.attn_output.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
    %912 = "torch_c.from_builtin_tensor"(%911) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
    %913 = "util.global.load"() <{global = @__auto.blk.18.ffn_norm.weight}> : () -> tensor<4096xbf16>
    %914 = "torch_c.from_builtin_tensor"(%913) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
    %915 = "util.global.load"() <{global = @"__auto.blk.18.ffn_gate.q_input:rscale"}> : () -> tensor<f32>
    %916 = "torch_c.from_builtin_tensor"(%915) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %917 = "util.global.load"() <{global = @"__auto.blk.18.ffn_gate.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
    %918 = "torch_c.from_builtin_tensor"(%917) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
    %919 = "util.global.load"() <{global = @"__auto.blk.18.ffn_up.q_input:rscale"}> : () -> tensor<f32>
    %920 = "torch_c.from_builtin_tensor"(%919) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %921 = "util.global.load"() <{global = @"__auto.blk.18.ffn_up.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
    %922 = "torch_c.from_builtin_tensor"(%921) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
    %923 = "util.global.load"() <{global = @"__auto.blk.18.ffn_down.q_input:rscale"}> : () -> tensor<f32>
    %924 = "torch_c.from_builtin_tensor"(%923) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %925 = "util.global.load"() <{global = @"__auto.blk.18.ffn_down.weight:qs"}> : () -> tensor<4096x14336xf8E4M3FNUZ>
    %926 = "torch_c.from_builtin_tensor"(%925) : (tensor<4096x14336xf8E4M3FNUZ>) -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
    %927 = "util.global.load"() <{global = @__auto.blk.19.attn_norm.weight}> : () -> tensor<4096xbf16>
    %928 = "torch_c.from_builtin_tensor"(%927) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
    %929 = "util.global.load"() <{global = @"__auto.blk.19.attn_q.q_input:rscale"}> : () -> tensor<f32>
    %930 = "torch_c.from_builtin_tensor"(%929) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %931 = "util.global.load"() <{global = @"__auto.blk.19.attn_q.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
    %932 = "torch_c.from_builtin_tensor"(%931) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
    %933 = "util.global.load"() <{global = @"__auto.blk.19.attn_q.q_output:rscale"}> : () -> tensor<f32>
    %934 = "torch_c.from_builtin_tensor"(%933) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %935 = "util.global.load"() <{global = @"__auto.blk.19.attn_k.q_input:rscale"}> : () -> tensor<f32>
    %936 = "torch_c.from_builtin_tensor"(%935) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %937 = "util.global.load"() <{global = @"__auto.blk.19.attn_k.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
    %938 = "torch_c.from_builtin_tensor"(%937) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
    %939 = "util.global.load"() <{global = @"__auto.blk.19.attn_k.q_output:rscale"}> : () -> tensor<f32>
    %940 = "torch_c.from_builtin_tensor"(%939) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %941 = "util.global.load"() <{global = @"__auto.blk.19.attn_v.q_input:rscale"}> : () -> tensor<f32>
    %942 = "torch_c.from_builtin_tensor"(%941) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %943 = "util.global.load"() <{global = @"__auto.blk.19.attn_v.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
    %944 = "torch_c.from_builtin_tensor"(%943) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
    %945 = "util.global.load"() <{global = @"__auto.blk.19.attn_v.q_output:rscale"}> : () -> tensor<f32>
    %946 = "torch_c.from_builtin_tensor"(%945) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %947 = "torch.vtensor.literal"() <{value = dense<0> : tensor<si64>}> : () -> !torch.vtensor<[],si64>
    %948 = "torch.vtensor.literal"() <{value = dense<1> : tensor<si64>}> : () -> !torch.vtensor<[],si64>
    %949 = "util.global.load"() <{global = @__auto.blk.19.attn_scale}> : () -> tensor<f32>
    %950 = "torch_c.from_builtin_tensor"(%949) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %951 = "util.global.load"() <{global = @"__auto.blk.19.attn_output.q_input:rscale"}> : () -> tensor<f32>
    %952 = "torch_c.from_builtin_tensor"(%951) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %953 = "util.global.load"() <{global = @"__auto.blk.19.attn_output.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
    %954 = "torch_c.from_builtin_tensor"(%953) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
    %955 = "util.global.load"() <{global = @__auto.blk.19.ffn_norm.weight}> : () -> tensor<4096xbf16>
    %956 = "torch_c.from_builtin_tensor"(%955) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
    %957 = "util.global.load"() <{global = @"__auto.blk.19.ffn_gate.q_input:rscale"}> : () -> tensor<f32>
    %958 = "torch_c.from_builtin_tensor"(%957) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %959 = "util.global.load"() <{global = @"__auto.blk.19.ffn_gate.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
    %960 = "torch_c.from_builtin_tensor"(%959) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
    %961 = "util.global.load"() <{global = @"__auto.blk.19.ffn_up.q_input:rscale"}> : () -> tensor<f32>
    %962 = "torch_c.from_builtin_tensor"(%961) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %963 = "util.global.load"() <{global = @"__auto.blk.19.ffn_up.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
    %964 = "torch_c.from_builtin_tensor"(%963) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
    %965 = "util.global.load"() <{global = @"__auto.blk.19.ffn_down.q_input:rscale"}> : () -> tensor<f32>
    %966 = "torch_c.from_builtin_tensor"(%965) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %967 = "util.global.load"() <{global = @"__auto.blk.19.ffn_down.weight:qs"}> : () -> tensor<4096x14336xf8E4M3FNUZ>
    %968 = "torch_c.from_builtin_tensor"(%967) : (tensor<4096x14336xf8E4M3FNUZ>) -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
    %969 = "util.global.load"() <{global = @__auto.blk.20.attn_norm.weight}> : () -> tensor<4096xbf16>
    %970 = "torch_c.from_builtin_tensor"(%969) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
    %971 = "util.global.load"() <{global = @"__auto.blk.20.attn_q.q_input:rscale"}> : () -> tensor<f32>
    %972 = "torch_c.from_builtin_tensor"(%971) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %973 = "util.global.load"() <{global = @"__auto.blk.20.attn_q.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
    %974 = "torch_c.from_builtin_tensor"(%973) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
    %975 = "util.global.load"() <{global = @"__auto.blk.20.attn_q.q_output:rscale"}> : () -> tensor<f32>
    %976 = "torch_c.from_builtin_tensor"(%975) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %977 = "util.global.load"() <{global = @"__auto.blk.20.attn_k.q_input:rscale"}> : () -> tensor<f32>
    %978 = "torch_c.from_builtin_tensor"(%977) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %979 = "util.global.load"() <{global = @"__auto.blk.20.attn_k.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
    %980 = "torch_c.from_builtin_tensor"(%979) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
    %981 = "util.global.load"() <{global = @"__auto.blk.20.attn_k.q_output:rscale"}> : () -> tensor<f32>
    %982 = "torch_c.from_builtin_tensor"(%981) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %983 = "util.global.load"() <{global = @"__auto.blk.20.attn_v.q_input:rscale"}> : () -> tensor<f32>
    %984 = "torch_c.from_builtin_tensor"(%983) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %985 = "util.global.load"() <{global = @"__auto.blk.20.attn_v.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
    %986 = "torch_c.from_builtin_tensor"(%985) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
    %987 = "util.global.load"() <{global = @"__auto.blk.20.attn_v.q_output:rscale"}> : () -> tensor<f32>
    %988 = "torch_c.from_builtin_tensor"(%987) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %989 = "torch.vtensor.literal"() <{value = dense<0> : tensor<si64>}> : () -> !torch.vtensor<[],si64>
    %990 = "torch.vtensor.literal"() <{value = dense<1> : tensor<si64>}> : () -> !torch.vtensor<[],si64>
    %991 = "util.global.load"() <{global = @__auto.blk.20.attn_scale}> : () -> tensor<f32>
    %992 = "torch_c.from_builtin_tensor"(%991) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %993 = "util.global.load"() <{global = @"__auto.blk.20.attn_output.q_input:rscale"}> : () -> tensor<f32>
    %994 = "torch_c.from_builtin_tensor"(%993) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %995 = "util.global.load"() <{global = @"__auto.blk.20.attn_output.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
    %996 = "torch_c.from_builtin_tensor"(%995) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
    %997 = "util.global.load"() <{global = @__auto.blk.20.ffn_norm.weight}> : () -> tensor<4096xbf16>
    %998 = "torch_c.from_builtin_tensor"(%997) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
    %999 = "util.global.load"() <{global = @"__auto.blk.20.ffn_gate.q_input:rscale"}> : () -> tensor<f32>
    %1000 = "torch_c.from_builtin_tensor"(%999) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %1001 = "util.global.load"() <{global = @"__auto.blk.20.ffn_gate.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
    %1002 = "torch_c.from_builtin_tensor"(%1001) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
    %1003 = "util.global.load"() <{global = @"__auto.blk.20.ffn_up.q_input:rscale"}> : () -> tensor<f32>
    %1004 = "torch_c.from_builtin_tensor"(%1003) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %1005 = "util.global.load"() <{global = @"__auto.blk.20.ffn_up.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
    %1006 = "torch_c.from_builtin_tensor"(%1005) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
    %1007 = "util.global.load"() <{global = @"__auto.blk.20.ffn_down.q_input:rscale"}> : () -> tensor<f32>
    %1008 = "torch_c.from_builtin_tensor"(%1007) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %1009 = "util.global.load"() <{global = @"__auto.blk.20.ffn_down.weight:qs"}> : () -> tensor<4096x14336xf8E4M3FNUZ>
    %1010 = "torch_c.from_builtin_tensor"(%1009) : (tensor<4096x14336xf8E4M3FNUZ>) -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
    %1011 = "util.global.load"() <{global = @__auto.blk.21.attn_norm.weight}> : () -> tensor<4096xbf16>
    %1012 = "torch_c.from_builtin_tensor"(%1011) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
    %1013 = "util.global.load"() <{global = @"__auto.blk.21.attn_q.q_input:rscale"}> : () -> tensor<f32>
    %1014 = "torch_c.from_builtin_tensor"(%1013) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %1015 = "util.global.load"() <{global = @"__auto.blk.21.attn_q.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
    %1016 = "torch_c.from_builtin_tensor"(%1015) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
    %1017 = "util.global.load"() <{global = @"__auto.blk.21.attn_q.q_output:rscale"}> : () -> tensor<f32>
    %1018 = "torch_c.from_builtin_tensor"(%1017) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %1019 = "util.global.load"() <{global = @"__auto.blk.21.attn_k.q_input:rscale"}> : () -> tensor<f32>
    %1020 = "torch_c.from_builtin_tensor"(%1019) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %1021 = "util.global.load"() <{global = @"__auto.blk.21.attn_k.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
    %1022 = "torch_c.from_builtin_tensor"(%1021) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
    %1023 = "util.global.load"() <{global = @"__auto.blk.21.attn_k.q_output:rscale"}> : () -> tensor<f32>
    %1024 = "torch_c.from_builtin_tensor"(%1023) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %1025 = "util.global.load"() <{global = @"__auto.blk.21.attn_v.q_input:rscale"}> : () -> tensor<f32>
    %1026 = "torch_c.from_builtin_tensor"(%1025) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %1027 = "util.global.load"() <{global = @"__auto.blk.21.attn_v.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
    %1028 = "torch_c.from_builtin_tensor"(%1027) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
    %1029 = "util.global.load"() <{global = @"__auto.blk.21.attn_v.q_output:rscale"}> : () -> tensor<f32>
    %1030 = "torch_c.from_builtin_tensor"(%1029) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %1031 = "torch.vtensor.literal"() <{value = dense<0> : tensor<si64>}> : () -> !torch.vtensor<[],si64>
    %1032 = "torch.vtensor.literal"() <{value = dense<1> : tensor<si64>}> : () -> !torch.vtensor<[],si64>
    %1033 = "util.global.load"() <{global = @__auto.blk.21.attn_scale}> : () -> tensor<f32>
    %1034 = "torch_c.from_builtin_tensor"(%1033) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %1035 = "util.global.load"() <{global = @"__auto.blk.21.attn_output.q_input:rscale"}> : () -> tensor<f32>
    %1036 = "torch_c.from_builtin_tensor"(%1035) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %1037 = "util.global.load"() <{global = @"__auto.blk.21.attn_output.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
    %1038 = "torch_c.from_builtin_tensor"(%1037) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
    %1039 = "util.global.load"() <{global = @__auto.blk.21.ffn_norm.weight}> : () -> tensor<4096xbf16>
    %1040 = "torch_c.from_builtin_tensor"(%1039) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
    %1041 = "util.global.load"() <{global = @"__auto.blk.21.ffn_gate.q_input:rscale"}> : () -> tensor<f32>
    %1042 = "torch_c.from_builtin_tensor"(%1041) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %1043 = "util.global.load"() <{global = @"__auto.blk.21.ffn_gate.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
    %1044 = "torch_c.from_builtin_tensor"(%1043) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
    %1045 = "util.global.load"() <{global = @"__auto.blk.21.ffn_up.q_input:rscale"}> : () -> tensor<f32>
    %1046 = "torch_c.from_builtin_tensor"(%1045) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %1047 = "util.global.load"() <{global = @"__auto.blk.21.ffn_up.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
    %1048 = "torch_c.from_builtin_tensor"(%1047) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
    %1049 = "util.global.load"() <{global = @"__auto.blk.21.ffn_down.q_input:rscale"}> : () -> tensor<f32>
    %1050 = "torch_c.from_builtin_tensor"(%1049) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %1051 = "util.global.load"() <{global = @"__auto.blk.21.ffn_down.weight:qs"}> : () -> tensor<4096x14336xf8E4M3FNUZ>
    %1052 = "torch_c.from_builtin_tensor"(%1051) : (tensor<4096x14336xf8E4M3FNUZ>) -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
    %1053 = "util.global.load"() <{global = @__auto.blk.22.attn_norm.weight}> : () -> tensor<4096xbf16>
    %1054 = "torch_c.from_builtin_tensor"(%1053) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
    %1055 = "util.global.load"() <{global = @"__auto.blk.22.attn_q.q_input:rscale"}> : () -> tensor<f32>
    %1056 = "torch_c.from_builtin_tensor"(%1055) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %1057 = "util.global.load"() <{global = @"__auto.blk.22.attn_q.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
    %1058 = "torch_c.from_builtin_tensor"(%1057) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
    %1059 = "util.global.load"() <{global = @"__auto.blk.22.attn_q.q_output:rscale"}> : () -> tensor<f32>
    %1060 = "torch_c.from_builtin_tensor"(%1059) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %1061 = "util.global.load"() <{global = @"__auto.blk.22.attn_k.q_input:rscale"}> : () -> tensor<f32>
    %1062 = "torch_c.from_builtin_tensor"(%1061) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %1063 = "util.global.load"() <{global = @"__auto.blk.22.attn_k.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
    %1064 = "torch_c.from_builtin_tensor"(%1063) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
    %1065 = "util.global.load"() <{global = @"__auto.blk.22.attn_k.q_output:rscale"}> : () -> tensor<f32>
    %1066 = "torch_c.from_builtin_tensor"(%1065) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %1067 = "util.global.load"() <{global = @"__auto.blk.22.attn_v.q_input:rscale"}> : () -> tensor<f32>
    %1068 = "torch_c.from_builtin_tensor"(%1067) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %1069 = "util.global.load"() <{global = @"__auto.blk.22.attn_v.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
    %1070 = "torch_c.from_builtin_tensor"(%1069) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
    %1071 = "util.global.load"() <{global = @"__auto.blk.22.attn_v.q_output:rscale"}> : () -> tensor<f32>
    %1072 = "torch_c.from_builtin_tensor"(%1071) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %1073 = "torch.vtensor.literal"() <{value = dense<0> : tensor<si64>}> : () -> !torch.vtensor<[],si64>
    %1074 = "torch.vtensor.literal"() <{value = dense<1> : tensor<si64>}> : () -> !torch.vtensor<[],si64>
    %1075 = "util.global.load"() <{global = @__auto.blk.22.attn_scale}> : () -> tensor<f32>
    %1076 = "torch_c.from_builtin_tensor"(%1075) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %1077 = "util.global.load"() <{global = @"__auto.blk.22.attn_output.q_input:rscale"}> : () -> tensor<f32>
    %1078 = "torch_c.from_builtin_tensor"(%1077) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %1079 = "util.global.load"() <{global = @"__auto.blk.22.attn_output.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
    %1080 = "torch_c.from_builtin_tensor"(%1079) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
    %1081 = "util.global.load"() <{global = @__auto.blk.22.ffn_norm.weight}> : () -> tensor<4096xbf16>
    %1082 = "torch_c.from_builtin_tensor"(%1081) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
    %1083 = "util.global.load"() <{global = @"__auto.blk.22.ffn_gate.q_input:rscale"}> : () -> tensor<f32>
    %1084 = "torch_c.from_builtin_tensor"(%1083) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %1085 = "util.global.load"() <{global = @"__auto.blk.22.ffn_gate.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
    %1086 = "torch_c.from_builtin_tensor"(%1085) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
    %1087 = "util.global.load"() <{global = @"__auto.blk.22.ffn_up.q_input:rscale"}> : () -> tensor<f32>
    %1088 = "torch_c.from_builtin_tensor"(%1087) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %1089 = "util.global.load"() <{global = @"__auto.blk.22.ffn_up.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
    %1090 = "torch_c.from_builtin_tensor"(%1089) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
    %1091 = "util.global.load"() <{global = @"__auto.blk.22.ffn_down.q_input:rscale"}> : () -> tensor<f32>
    %1092 = "torch_c.from_builtin_tensor"(%1091) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %1093 = "util.global.load"() <{global = @"__auto.blk.22.ffn_down.weight:qs"}> : () -> tensor<4096x14336xf8E4M3FNUZ>
    %1094 = "torch_c.from_builtin_tensor"(%1093) : (tensor<4096x14336xf8E4M3FNUZ>) -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
    %1095 = "util.global.load"() <{global = @__auto.blk.23.attn_norm.weight}> : () -> tensor<4096xbf16>
    %1096 = "torch_c.from_builtin_tensor"(%1095) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
    %1097 = "util.global.load"() <{global = @"__auto.blk.23.attn_q.q_input:rscale"}> : () -> tensor<f32>
    %1098 = "torch_c.from_builtin_tensor"(%1097) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %1099 = "util.global.load"() <{global = @"__auto.blk.23.attn_q.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
    %1100 = "torch_c.from_builtin_tensor"(%1099) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
    %1101 = "util.global.load"() <{global = @"__auto.blk.23.attn_q.q_output:rscale"}> : () -> tensor<f32>
    %1102 = "torch_c.from_builtin_tensor"(%1101) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %1103 = "util.global.load"() <{global = @"__auto.blk.23.attn_k.q_input:rscale"}> : () -> tensor<f32>
    %1104 = "torch_c.from_builtin_tensor"(%1103) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %1105 = "util.global.load"() <{global = @"__auto.blk.23.attn_k.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
    %1106 = "torch_c.from_builtin_tensor"(%1105) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
    %1107 = "util.global.load"() <{global = @"__auto.blk.23.attn_k.q_output:rscale"}> : () -> tensor<f32>
    %1108 = "torch_c.from_builtin_tensor"(%1107) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %1109 = "util.global.load"() <{global = @"__auto.blk.23.attn_v.q_input:rscale"}> : () -> tensor<f32>
    %1110 = "torch_c.from_builtin_tensor"(%1109) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %1111 = "util.global.load"() <{global = @"__auto.blk.23.attn_v.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
    %1112 = "torch_c.from_builtin_tensor"(%1111) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
    %1113 = "util.global.load"() <{global = @"__auto.blk.23.attn_v.q_output:rscale"}> : () -> tensor<f32>
    %1114 = "torch_c.from_builtin_tensor"(%1113) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %1115 = "torch.vtensor.literal"() <{value = dense<0> : tensor<si64>}> : () -> !torch.vtensor<[],si64>
    %1116 = "torch.vtensor.literal"() <{value = dense<1> : tensor<si64>}> : () -> !torch.vtensor<[],si64>
    %1117 = "util.global.load"() <{global = @__auto.blk.23.attn_scale}> : () -> tensor<f32>
    %1118 = "torch_c.from_builtin_tensor"(%1117) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %1119 = "util.global.load"() <{global = @"__auto.blk.23.attn_output.q_input:rscale"}> : () -> tensor<f32>
    %1120 = "torch_c.from_builtin_tensor"(%1119) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %1121 = "util.global.load"() <{global = @"__auto.blk.23.attn_output.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
    %1122 = "torch_c.from_builtin_tensor"(%1121) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
    %1123 = "util.global.load"() <{global = @__auto.blk.23.ffn_norm.weight}> : () -> tensor<4096xbf16>
    %1124 = "torch_c.from_builtin_tensor"(%1123) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
    %1125 = "util.global.load"() <{global = @"__auto.blk.23.ffn_gate.q_input:rscale"}> : () -> tensor<f32>
    %1126 = "torch_c.from_builtin_tensor"(%1125) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %1127 = "util.global.load"() <{global = @"__auto.blk.23.ffn_gate.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
    %1128 = "torch_c.from_builtin_tensor"(%1127) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
    %1129 = "util.global.load"() <{global = @"__auto.blk.23.ffn_up.q_input:rscale"}> : () -> tensor<f32>
    %1130 = "torch_c.from_builtin_tensor"(%1129) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %1131 = "util.global.load"() <{global = @"__auto.blk.23.ffn_up.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
    %1132 = "torch_c.from_builtin_tensor"(%1131) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
    %1133 = "util.global.load"() <{global = @"__auto.blk.23.ffn_down.q_input:rscale"}> : () -> tensor<f32>
    %1134 = "torch_c.from_builtin_tensor"(%1133) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %1135 = "util.global.load"() <{global = @"__auto.blk.23.ffn_down.weight:qs"}> : () -> tensor<4096x14336xf8E4M3FNUZ>
    %1136 = "torch_c.from_builtin_tensor"(%1135) : (tensor<4096x14336xf8E4M3FNUZ>) -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
    %1137 = "util.global.load"() <{global = @__auto.blk.24.attn_norm.weight}> : () -> tensor<4096xbf16>
    %1138 = "torch_c.from_builtin_tensor"(%1137) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
    %1139 = "util.global.load"() <{global = @"__auto.blk.24.attn_q.q_input:rscale"}> : () -> tensor<f32>
    %1140 = "torch_c.from_builtin_tensor"(%1139) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %1141 = "util.global.load"() <{global = @"__auto.blk.24.attn_q.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
    %1142 = "torch_c.from_builtin_tensor"(%1141) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
    %1143 = "util.global.load"() <{global = @"__auto.blk.24.attn_q.q_output:rscale"}> : () -> tensor<f32>
    %1144 = "torch_c.from_builtin_tensor"(%1143) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %1145 = "util.global.load"() <{global = @"__auto.blk.24.attn_k.q_input:rscale"}> : () -> tensor<f32>
    %1146 = "torch_c.from_builtin_tensor"(%1145) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %1147 = "util.global.load"() <{global = @"__auto.blk.24.attn_k.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
    %1148 = "torch_c.from_builtin_tensor"(%1147) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
    %1149 = "util.global.load"() <{global = @"__auto.blk.24.attn_k.q_output:rscale"}> : () -> tensor<f32>
    %1150 = "torch_c.from_builtin_tensor"(%1149) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %1151 = "util.global.load"() <{global = @"__auto.blk.24.attn_v.q_input:rscale"}> : () -> tensor<f32>
    %1152 = "torch_c.from_builtin_tensor"(%1151) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %1153 = "util.global.load"() <{global = @"__auto.blk.24.attn_v.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
    %1154 = "torch_c.from_builtin_tensor"(%1153) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
    %1155 = "util.global.load"() <{global = @"__auto.blk.24.attn_v.q_output:rscale"}> : () -> tensor<f32>
    %1156 = "torch_c.from_builtin_tensor"(%1155) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %1157 = "torch.vtensor.literal"() <{value = dense<0> : tensor<si64>}> : () -> !torch.vtensor<[],si64>
    %1158 = "torch.vtensor.literal"() <{value = dense<1> : tensor<si64>}> : () -> !torch.vtensor<[],si64>
    %1159 = "util.global.load"() <{global = @__auto.blk.24.attn_scale}> : () -> tensor<f32>
    %1160 = "torch_c.from_builtin_tensor"(%1159) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %1161 = "util.global.load"() <{global = @"__auto.blk.24.attn_output.q_input:rscale"}> : () -> tensor<f32>
    %1162 = "torch_c.from_builtin_tensor"(%1161) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %1163 = "util.global.load"() <{global = @"__auto.blk.24.attn_output.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
    %1164 = "torch_c.from_builtin_tensor"(%1163) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
    %1165 = "util.global.load"() <{global = @__auto.blk.24.ffn_norm.weight}> : () -> tensor<4096xbf16>
    %1166 = "torch_c.from_builtin_tensor"(%1165) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
    %1167 = "util.global.load"() <{global = @"__auto.blk.24.ffn_gate.q_input:rscale"}> : () -> tensor<f32>
    %1168 = "torch_c.from_builtin_tensor"(%1167) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %1169 = "util.global.load"() <{global = @"__auto.blk.24.ffn_gate.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
    %1170 = "torch_c.from_builtin_tensor"(%1169) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
    %1171 = "util.global.load"() <{global = @"__auto.blk.24.ffn_up.q_input:rscale"}> : () -> tensor<f32>
    %1172 = "torch_c.from_builtin_tensor"(%1171) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %1173 = "util.global.load"() <{global = @"__auto.blk.24.ffn_up.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
    %1174 = "torch_c.from_builtin_tensor"(%1173) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
    %1175 = "util.global.load"() <{global = @"__auto.blk.24.ffn_down.q_input:rscale"}> : () -> tensor<f32>
    %1176 = "torch_c.from_builtin_tensor"(%1175) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %1177 = "util.global.load"() <{global = @"__auto.blk.24.ffn_down.weight:qs"}> : () -> tensor<4096x14336xf8E4M3FNUZ>
    %1178 = "torch_c.from_builtin_tensor"(%1177) : (tensor<4096x14336xf8E4M3FNUZ>) -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
    %1179 = "util.global.load"() <{global = @__auto.blk.25.attn_norm.weight}> : () -> tensor<4096xbf16>
    %1180 = "torch_c.from_builtin_tensor"(%1179) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
    %1181 = "util.global.load"() <{global = @"__auto.blk.25.attn_q.q_input:rscale"}> : () -> tensor<f32>
    %1182 = "torch_c.from_builtin_tensor"(%1181) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %1183 = "util.global.load"() <{global = @"__auto.blk.25.attn_q.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
    %1184 = "torch_c.from_builtin_tensor"(%1183) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
    %1185 = "util.global.load"() <{global = @"__auto.blk.25.attn_q.q_output:rscale"}> : () -> tensor<f32>
    %1186 = "torch_c.from_builtin_tensor"(%1185) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %1187 = "util.global.load"() <{global = @"__auto.blk.25.attn_k.q_input:rscale"}> : () -> tensor<f32>
    %1188 = "torch_c.from_builtin_tensor"(%1187) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %1189 = "util.global.load"() <{global = @"__auto.blk.25.attn_k.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
    %1190 = "torch_c.from_builtin_tensor"(%1189) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
    %1191 = "util.global.load"() <{global = @"__auto.blk.25.attn_k.q_output:rscale"}> : () -> tensor<f32>
    %1192 = "torch_c.from_builtin_tensor"(%1191) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %1193 = "util.global.load"() <{global = @"__auto.blk.25.attn_v.q_input:rscale"}> : () -> tensor<f32>
    %1194 = "torch_c.from_builtin_tensor"(%1193) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %1195 = "util.global.load"() <{global = @"__auto.blk.25.attn_v.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
    %1196 = "torch_c.from_builtin_tensor"(%1195) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
    %1197 = "util.global.load"() <{global = @"__auto.blk.25.attn_v.q_output:rscale"}> : () -> tensor<f32>
    %1198 = "torch_c.from_builtin_tensor"(%1197) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %1199 = "torch.vtensor.literal"() <{value = dense<0> : tensor<si64>}> : () -> !torch.vtensor<[],si64>
    %1200 = "torch.vtensor.literal"() <{value = dense<1> : tensor<si64>}> : () -> !torch.vtensor<[],si64>
    %1201 = "util.global.load"() <{global = @__auto.blk.25.attn_scale}> : () -> tensor<f32>
    %1202 = "torch_c.from_builtin_tensor"(%1201) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %1203 = "util.global.load"() <{global = @"__auto.blk.25.attn_output.q_input:rscale"}> : () -> tensor<f32>
    %1204 = "torch_c.from_builtin_tensor"(%1203) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %1205 = "util.global.load"() <{global = @"__auto.blk.25.attn_output.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
    %1206 = "torch_c.from_builtin_tensor"(%1205) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
    %1207 = "util.global.load"() <{global = @__auto.blk.25.ffn_norm.weight}> : () -> tensor<4096xbf16>
    %1208 = "torch_c.from_builtin_tensor"(%1207) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
    %1209 = "util.global.load"() <{global = @"__auto.blk.25.ffn_gate.q_input:rscale"}> : () -> tensor<f32>
    %1210 = "torch_c.from_builtin_tensor"(%1209) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %1211 = "util.global.load"() <{global = @"__auto.blk.25.ffn_gate.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
    %1212 = "torch_c.from_builtin_tensor"(%1211) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
    %1213 = "util.global.load"() <{global = @"__auto.blk.25.ffn_up.q_input:rscale"}> : () -> tensor<f32>
    %1214 = "torch_c.from_builtin_tensor"(%1213) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %1215 = "util.global.load"() <{global = @"__auto.blk.25.ffn_up.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
    %1216 = "torch_c.from_builtin_tensor"(%1215) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
    %1217 = "util.global.load"() <{global = @"__auto.blk.25.ffn_down.q_input:rscale"}> : () -> tensor<f32>
    %1218 = "torch_c.from_builtin_tensor"(%1217) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %1219 = "util.global.load"() <{global = @"__auto.blk.25.ffn_down.weight:qs"}> : () -> tensor<4096x14336xf8E4M3FNUZ>
    %1220 = "torch_c.from_builtin_tensor"(%1219) : (tensor<4096x14336xf8E4M3FNUZ>) -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
    %1221 = "util.global.load"() <{global = @__auto.blk.26.attn_norm.weight}> : () -> tensor<4096xbf16>
    %1222 = "torch_c.from_builtin_tensor"(%1221) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
    %1223 = "util.global.load"() <{global = @"__auto.blk.26.attn_q.q_input:rscale"}> : () -> tensor<f32>
    %1224 = "torch_c.from_builtin_tensor"(%1223) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %1225 = "util.global.load"() <{global = @"__auto.blk.26.attn_q.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
    %1226 = "torch_c.from_builtin_tensor"(%1225) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
    %1227 = "util.global.load"() <{global = @"__auto.blk.26.attn_q.q_output:rscale"}> : () -> tensor<f32>
    %1228 = "torch_c.from_builtin_tensor"(%1227) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %1229 = "util.global.load"() <{global = @"__auto.blk.26.attn_k.q_input:rscale"}> : () -> tensor<f32>
    %1230 = "torch_c.from_builtin_tensor"(%1229) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %1231 = "util.global.load"() <{global = @"__auto.blk.26.attn_k.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
    %1232 = "torch_c.from_builtin_tensor"(%1231) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
    %1233 = "util.global.load"() <{global = @"__auto.blk.26.attn_k.q_output:rscale"}> : () -> tensor<f32>
    %1234 = "torch_c.from_builtin_tensor"(%1233) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %1235 = "util.global.load"() <{global = @"__auto.blk.26.attn_v.q_input:rscale"}> : () -> tensor<f32>
    %1236 = "torch_c.from_builtin_tensor"(%1235) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %1237 = "util.global.load"() <{global = @"__auto.blk.26.attn_v.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
    %1238 = "torch_c.from_builtin_tensor"(%1237) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
    %1239 = "util.global.load"() <{global = @"__auto.blk.26.attn_v.q_output:rscale"}> : () -> tensor<f32>
    %1240 = "torch_c.from_builtin_tensor"(%1239) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %1241 = "torch.vtensor.literal"() <{value = dense<0> : tensor<si64>}> : () -> !torch.vtensor<[],si64>
    %1242 = "torch.vtensor.literal"() <{value = dense<1> : tensor<si64>}> : () -> !torch.vtensor<[],si64>
    %1243 = "util.global.load"() <{global = @__auto.blk.26.attn_scale}> : () -> tensor<f32>
    %1244 = "torch_c.from_builtin_tensor"(%1243) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %1245 = "util.global.load"() <{global = @"__auto.blk.26.attn_output.q_input:rscale"}> : () -> tensor<f32>
    %1246 = "torch_c.from_builtin_tensor"(%1245) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %1247 = "util.global.load"() <{global = @"__auto.blk.26.attn_output.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
    %1248 = "torch_c.from_builtin_tensor"(%1247) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
    %1249 = "util.global.load"() <{global = @__auto.blk.26.ffn_norm.weight}> : () -> tensor<4096xbf16>
    %1250 = "torch_c.from_builtin_tensor"(%1249) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
    %1251 = "util.global.load"() <{global = @"__auto.blk.26.ffn_gate.q_input:rscale"}> : () -> tensor<f32>
    %1252 = "torch_c.from_builtin_tensor"(%1251) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %1253 = "util.global.load"() <{global = @"__auto.blk.26.ffn_gate.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
    %1254 = "torch_c.from_builtin_tensor"(%1253) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
    %1255 = "util.global.load"() <{global = @"__auto.blk.26.ffn_up.q_input:rscale"}> : () -> tensor<f32>
    %1256 = "torch_c.from_builtin_tensor"(%1255) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %1257 = "util.global.load"() <{global = @"__auto.blk.26.ffn_up.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
    %1258 = "torch_c.from_builtin_tensor"(%1257) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
    %1259 = "util.global.load"() <{global = @"__auto.blk.26.ffn_down.q_input:rscale"}> : () -> tensor<f32>
    %1260 = "torch_c.from_builtin_tensor"(%1259) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %1261 = "util.global.load"() <{global = @"__auto.blk.26.ffn_down.weight:qs"}> : () -> tensor<4096x14336xf8E4M3FNUZ>
    %1262 = "torch_c.from_builtin_tensor"(%1261) : (tensor<4096x14336xf8E4M3FNUZ>) -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
    %1263 = "util.global.load"() <{global = @__auto.blk.27.attn_norm.weight}> : () -> tensor<4096xbf16>
    %1264 = "torch_c.from_builtin_tensor"(%1263) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
    %1265 = "util.global.load"() <{global = @"__auto.blk.27.attn_q.q_input:rscale"}> : () -> tensor<f32>
    %1266 = "torch_c.from_builtin_tensor"(%1265) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %1267 = "util.global.load"() <{global = @"__auto.blk.27.attn_q.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
    %1268 = "torch_c.from_builtin_tensor"(%1267) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
    %1269 = "util.global.load"() <{global = @"__auto.blk.27.attn_q.q_output:rscale"}> : () -> tensor<f32>
    %1270 = "torch_c.from_builtin_tensor"(%1269) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %1271 = "util.global.load"() <{global = @"__auto.blk.27.attn_k.q_input:rscale"}> : () -> tensor<f32>
    %1272 = "torch_c.from_builtin_tensor"(%1271) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %1273 = "util.global.load"() <{global = @"__auto.blk.27.attn_k.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
    %1274 = "torch_c.from_builtin_tensor"(%1273) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
    %1275 = "util.global.load"() <{global = @"__auto.blk.27.attn_k.q_output:rscale"}> : () -> tensor<f32>
    %1276 = "torch_c.from_builtin_tensor"(%1275) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %1277 = "util.global.load"() <{global = @"__auto.blk.27.attn_v.q_input:rscale"}> : () -> tensor<f32>
    %1278 = "torch_c.from_builtin_tensor"(%1277) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %1279 = "util.global.load"() <{global = @"__auto.blk.27.attn_v.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
    %1280 = "torch_c.from_builtin_tensor"(%1279) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
    %1281 = "util.global.load"() <{global = @"__auto.blk.27.attn_v.q_output:rscale"}> : () -> tensor<f32>
    %1282 = "torch_c.from_builtin_tensor"(%1281) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %1283 = "torch.vtensor.literal"() <{value = dense<0> : tensor<si64>}> : () -> !torch.vtensor<[],si64>
    %1284 = "torch.vtensor.literal"() <{value = dense<1> : tensor<si64>}> : () -> !torch.vtensor<[],si64>
    %1285 = "util.global.load"() <{global = @__auto.blk.27.attn_scale}> : () -> tensor<f32>
    %1286 = "torch_c.from_builtin_tensor"(%1285) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %1287 = "util.global.load"() <{global = @"__auto.blk.27.attn_output.q_input:rscale"}> : () -> tensor<f32>
    %1288 = "torch_c.from_builtin_tensor"(%1287) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %1289 = "util.global.load"() <{global = @"__auto.blk.27.attn_output.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
    %1290 = "torch_c.from_builtin_tensor"(%1289) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
    %1291 = "util.global.load"() <{global = @__auto.blk.27.ffn_norm.weight}> : () -> tensor<4096xbf16>
    %1292 = "torch_c.from_builtin_tensor"(%1291) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
    %1293 = "util.global.load"() <{global = @"__auto.blk.27.ffn_gate.q_input:rscale"}> : () -> tensor<f32>
    %1294 = "torch_c.from_builtin_tensor"(%1293) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %1295 = "util.global.load"() <{global = @"__auto.blk.27.ffn_gate.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
    %1296 = "torch_c.from_builtin_tensor"(%1295) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
    %1297 = "util.global.load"() <{global = @"__auto.blk.27.ffn_up.q_input:rscale"}> : () -> tensor<f32>
    %1298 = "torch_c.from_builtin_tensor"(%1297) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %1299 = "util.global.load"() <{global = @"__auto.blk.27.ffn_up.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
    %1300 = "torch_c.from_builtin_tensor"(%1299) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
    %1301 = "util.global.load"() <{global = @"__auto.blk.27.ffn_down.q_input:rscale"}> : () -> tensor<f32>
    %1302 = "torch_c.from_builtin_tensor"(%1301) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %1303 = "util.global.load"() <{global = @"__auto.blk.27.ffn_down.weight:qs"}> : () -> tensor<4096x14336xf8E4M3FNUZ>
    %1304 = "torch_c.from_builtin_tensor"(%1303) : (tensor<4096x14336xf8E4M3FNUZ>) -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
    %1305 = "util.global.load"() <{global = @__auto.blk.28.attn_norm.weight}> : () -> tensor<4096xbf16>
    %1306 = "torch_c.from_builtin_tensor"(%1305) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
    %1307 = "util.global.load"() <{global = @"__auto.blk.28.attn_q.q_input:rscale"}> : () -> tensor<f32>
    %1308 = "torch_c.from_builtin_tensor"(%1307) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %1309 = "util.global.load"() <{global = @"__auto.blk.28.attn_q.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
    %1310 = "torch_c.from_builtin_tensor"(%1309) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
    %1311 = "util.global.load"() <{global = @"__auto.blk.28.attn_q.q_output:rscale"}> : () -> tensor<f32>
    %1312 = "torch_c.from_builtin_tensor"(%1311) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %1313 = "util.global.load"() <{global = @"__auto.blk.28.attn_k.q_input:rscale"}> : () -> tensor<f32>
    %1314 = "torch_c.from_builtin_tensor"(%1313) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %1315 = "util.global.load"() <{global = @"__auto.blk.28.attn_k.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
    %1316 = "torch_c.from_builtin_tensor"(%1315) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
    %1317 = "util.global.load"() <{global = @"__auto.blk.28.attn_k.q_output:rscale"}> : () -> tensor<f32>
    %1318 = "torch_c.from_builtin_tensor"(%1317) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %1319 = "util.global.load"() <{global = @"__auto.blk.28.attn_v.q_input:rscale"}> : () -> tensor<f32>
    %1320 = "torch_c.from_builtin_tensor"(%1319) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %1321 = "util.global.load"() <{global = @"__auto.blk.28.attn_v.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
    %1322 = "torch_c.from_builtin_tensor"(%1321) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
    %1323 = "util.global.load"() <{global = @"__auto.blk.28.attn_v.q_output:rscale"}> : () -> tensor<f32>
    %1324 = "torch_c.from_builtin_tensor"(%1323) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %1325 = "torch.vtensor.literal"() <{value = dense<0> : tensor<si64>}> : () -> !torch.vtensor<[],si64>
    %1326 = "torch.vtensor.literal"() <{value = dense<1> : tensor<si64>}> : () -> !torch.vtensor<[],si64>
    %1327 = "util.global.load"() <{global = @__auto.blk.28.attn_scale}> : () -> tensor<f32>
    %1328 = "torch_c.from_builtin_tensor"(%1327) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %1329 = "util.global.load"() <{global = @"__auto.blk.28.attn_output.q_input:rscale"}> : () -> tensor<f32>
    %1330 = "torch_c.from_builtin_tensor"(%1329) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %1331 = "util.global.load"() <{global = @"__auto.blk.28.attn_output.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
    %1332 = "torch_c.from_builtin_tensor"(%1331) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
    %1333 = "util.global.load"() <{global = @__auto.blk.28.ffn_norm.weight}> : () -> tensor<4096xbf16>
    %1334 = "torch_c.from_builtin_tensor"(%1333) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
    %1335 = "util.global.load"() <{global = @"__auto.blk.28.ffn_gate.q_input:rscale"}> : () -> tensor<f32>
    %1336 = "torch_c.from_builtin_tensor"(%1335) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %1337 = "util.global.load"() <{global = @"__auto.blk.28.ffn_gate.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
    %1338 = "torch_c.from_builtin_tensor"(%1337) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
    %1339 = "util.global.load"() <{global = @"__auto.blk.28.ffn_up.q_input:rscale"}> : () -> tensor<f32>
    %1340 = "torch_c.from_builtin_tensor"(%1339) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %1341 = "util.global.load"() <{global = @"__auto.blk.28.ffn_up.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
    %1342 = "torch_c.from_builtin_tensor"(%1341) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
    %1343 = "util.global.load"() <{global = @"__auto.blk.28.ffn_down.q_input:rscale"}> : () -> tensor<f32>
    %1344 = "torch_c.from_builtin_tensor"(%1343) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %1345 = "util.global.load"() <{global = @"__auto.blk.28.ffn_down.weight:qs"}> : () -> tensor<4096x14336xf8E4M3FNUZ>
    %1346 = "torch_c.from_builtin_tensor"(%1345) : (tensor<4096x14336xf8E4M3FNUZ>) -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
    %1347 = "util.global.load"() <{global = @__auto.blk.29.attn_norm.weight}> : () -> tensor<4096xbf16>
    %1348 = "torch_c.from_builtin_tensor"(%1347) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
    %1349 = "util.global.load"() <{global = @"__auto.blk.29.attn_q.q_input:rscale"}> : () -> tensor<f32>
    %1350 = "torch_c.from_builtin_tensor"(%1349) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %1351 = "util.global.load"() <{global = @"__auto.blk.29.attn_q.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
    %1352 = "torch_c.from_builtin_tensor"(%1351) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
    %1353 = "util.global.load"() <{global = @"__auto.blk.29.attn_q.q_output:rscale"}> : () -> tensor<f32>
    %1354 = "torch_c.from_builtin_tensor"(%1353) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %1355 = "util.global.load"() <{global = @"__auto.blk.29.attn_k.q_input:rscale"}> : () -> tensor<f32>
    %1356 = "torch_c.from_builtin_tensor"(%1355) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %1357 = "util.global.load"() <{global = @"__auto.blk.29.attn_k.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
    %1358 = "torch_c.from_builtin_tensor"(%1357) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
    %1359 = "util.global.load"() <{global = @"__auto.blk.29.attn_k.q_output:rscale"}> : () -> tensor<f32>
    %1360 = "torch_c.from_builtin_tensor"(%1359) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %1361 = "util.global.load"() <{global = @"__auto.blk.29.attn_v.q_input:rscale"}> : () -> tensor<f32>
    %1362 = "torch_c.from_builtin_tensor"(%1361) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %1363 = "util.global.load"() <{global = @"__auto.blk.29.attn_v.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
    %1364 = "torch_c.from_builtin_tensor"(%1363) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
    %1365 = "util.global.load"() <{global = @"__auto.blk.29.attn_v.q_output:rscale"}> : () -> tensor<f32>
    %1366 = "torch_c.from_builtin_tensor"(%1365) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %1367 = "torch.vtensor.literal"() <{value = dense<0> : tensor<si64>}> : () -> !torch.vtensor<[],si64>
    %1368 = "torch.vtensor.literal"() <{value = dense<1> : tensor<si64>}> : () -> !torch.vtensor<[],si64>
    %1369 = "util.global.load"() <{global = @__auto.blk.29.attn_scale}> : () -> tensor<f32>
    %1370 = "torch_c.from_builtin_tensor"(%1369) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %1371 = "util.global.load"() <{global = @"__auto.blk.29.attn_output.q_input:rscale"}> : () -> tensor<f32>
    %1372 = "torch_c.from_builtin_tensor"(%1371) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %1373 = "util.global.load"() <{global = @"__auto.blk.29.attn_output.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
    %1374 = "torch_c.from_builtin_tensor"(%1373) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
    %1375 = "util.global.load"() <{global = @__auto.blk.29.ffn_norm.weight}> : () -> tensor<4096xbf16>
    %1376 = "torch_c.from_builtin_tensor"(%1375) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
    %1377 = "util.global.load"() <{global = @"__auto.blk.29.ffn_gate.q_input:rscale"}> : () -> tensor<f32>
    %1378 = "torch_c.from_builtin_tensor"(%1377) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %1379 = "util.global.load"() <{global = @"__auto.blk.29.ffn_gate.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
    %1380 = "torch_c.from_builtin_tensor"(%1379) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
    %1381 = "util.global.load"() <{global = @"__auto.blk.29.ffn_up.q_input:rscale"}> : () -> tensor<f32>
    %1382 = "torch_c.from_builtin_tensor"(%1381) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %1383 = "util.global.load"() <{global = @"__auto.blk.29.ffn_up.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
    %1384 = "torch_c.from_builtin_tensor"(%1383) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
    %1385 = "util.global.load"() <{global = @"__auto.blk.29.ffn_down.q_input:rscale"}> : () -> tensor<f32>
    %1386 = "torch_c.from_builtin_tensor"(%1385) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %1387 = "util.global.load"() <{global = @"__auto.blk.29.ffn_down.weight:qs"}> : () -> tensor<4096x14336xf8E4M3FNUZ>
    %1388 = "torch_c.from_builtin_tensor"(%1387) : (tensor<4096x14336xf8E4M3FNUZ>) -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
    %1389 = "util.global.load"() <{global = @__auto.blk.30.attn_norm.weight}> : () -> tensor<4096xbf16>
    %1390 = "torch_c.from_builtin_tensor"(%1389) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
    %1391 = "util.global.load"() <{global = @"__auto.blk.30.attn_q.q_input:rscale"}> : () -> tensor<f32>
    %1392 = "torch_c.from_builtin_tensor"(%1391) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %1393 = "util.global.load"() <{global = @"__auto.blk.30.attn_q.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
    %1394 = "torch_c.from_builtin_tensor"(%1393) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
    %1395 = "util.global.load"() <{global = @"__auto.blk.30.attn_q.q_output:rscale"}> : () -> tensor<f32>
    %1396 = "torch_c.from_builtin_tensor"(%1395) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %1397 = "util.global.load"() <{global = @"__auto.blk.30.attn_k.q_input:rscale"}> : () -> tensor<f32>
    %1398 = "torch_c.from_builtin_tensor"(%1397) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %1399 = "util.global.load"() <{global = @"__auto.blk.30.attn_k.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
    %1400 = "torch_c.from_builtin_tensor"(%1399) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
    %1401 = "util.global.load"() <{global = @"__auto.blk.30.attn_k.q_output:rscale"}> : () -> tensor<f32>
    %1402 = "torch_c.from_builtin_tensor"(%1401) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %1403 = "util.global.load"() <{global = @"__auto.blk.30.attn_v.q_input:rscale"}> : () -> tensor<f32>
    %1404 = "torch_c.from_builtin_tensor"(%1403) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %1405 = "util.global.load"() <{global = @"__auto.blk.30.attn_v.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
    %1406 = "torch_c.from_builtin_tensor"(%1405) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
    %1407 = "util.global.load"() <{global = @"__auto.blk.30.attn_v.q_output:rscale"}> : () -> tensor<f32>
    %1408 = "torch_c.from_builtin_tensor"(%1407) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %1409 = "torch.vtensor.literal"() <{value = dense<0> : tensor<si64>}> : () -> !torch.vtensor<[],si64>
    %1410 = "torch.vtensor.literal"() <{value = dense<1> : tensor<si64>}> : () -> !torch.vtensor<[],si64>
    %1411 = "util.global.load"() <{global = @__auto.blk.30.attn_scale}> : () -> tensor<f32>
    %1412 = "torch_c.from_builtin_tensor"(%1411) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %1413 = "util.global.load"() <{global = @"__auto.blk.30.attn_output.q_input:rscale"}> : () -> tensor<f32>
    %1414 = "torch_c.from_builtin_tensor"(%1413) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %1415 = "util.global.load"() <{global = @"__auto.blk.30.attn_output.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
    %1416 = "torch_c.from_builtin_tensor"(%1415) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
    %1417 = "util.global.load"() <{global = @__auto.blk.30.ffn_norm.weight}> : () -> tensor<4096xbf16>
    %1418 = "torch_c.from_builtin_tensor"(%1417) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
    %1419 = "util.global.load"() <{global = @"__auto.blk.30.ffn_gate.q_input:rscale"}> : () -> tensor<f32>
    %1420 = "torch_c.from_builtin_tensor"(%1419) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %1421 = "util.global.load"() <{global = @"__auto.blk.30.ffn_gate.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
    %1422 = "torch_c.from_builtin_tensor"(%1421) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
    %1423 = "util.global.load"() <{global = @"__auto.blk.30.ffn_up.q_input:rscale"}> : () -> tensor<f32>
    %1424 = "torch_c.from_builtin_tensor"(%1423) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %1425 = "util.global.load"() <{global = @"__auto.blk.30.ffn_up.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
    %1426 = "torch_c.from_builtin_tensor"(%1425) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
    %1427 = "util.global.load"() <{global = @"__auto.blk.30.ffn_down.q_input:rscale"}> : () -> tensor<f32>
    %1428 = "torch_c.from_builtin_tensor"(%1427) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %1429 = "util.global.load"() <{global = @"__auto.blk.30.ffn_down.weight:qs"}> : () -> tensor<4096x14336xf8E4M3FNUZ>
    %1430 = "torch_c.from_builtin_tensor"(%1429) : (tensor<4096x14336xf8E4M3FNUZ>) -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
    %1431 = "util.global.load"() <{global = @__auto.blk.31.attn_norm.weight}> : () -> tensor<4096xbf16>
    %1432 = "torch_c.from_builtin_tensor"(%1431) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
    %1433 = "util.global.load"() <{global = @"__auto.blk.31.attn_q.q_input:rscale"}> : () -> tensor<f32>
    %1434 = "torch_c.from_builtin_tensor"(%1433) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %1435 = "util.global.load"() <{global = @"__auto.blk.31.attn_q.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
    %1436 = "torch_c.from_builtin_tensor"(%1435) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
    %1437 = "util.global.load"() <{global = @"__auto.blk.31.attn_q.q_output:rscale"}> : () -> tensor<f32>
    %1438 = "torch_c.from_builtin_tensor"(%1437) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %1439 = "util.global.load"() <{global = @"__auto.blk.31.attn_k.q_input:rscale"}> : () -> tensor<f32>
    %1440 = "torch_c.from_builtin_tensor"(%1439) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %1441 = "util.global.load"() <{global = @"__auto.blk.31.attn_k.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
    %1442 = "torch_c.from_builtin_tensor"(%1441) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
    %1443 = "util.global.load"() <{global = @"__auto.blk.31.attn_k.q_output:rscale"}> : () -> tensor<f32>
    %1444 = "torch_c.from_builtin_tensor"(%1443) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %1445 = "util.global.load"() <{global = @"__auto.blk.31.attn_v.q_input:rscale"}> : () -> tensor<f32>
    %1446 = "torch_c.from_builtin_tensor"(%1445) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %1447 = "util.global.load"() <{global = @"__auto.blk.31.attn_v.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
    %1448 = "torch_c.from_builtin_tensor"(%1447) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
    %1449 = "util.global.load"() <{global = @"__auto.blk.31.attn_v.q_output:rscale"}> : () -> tensor<f32>
    %1450 = "torch_c.from_builtin_tensor"(%1449) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %1451 = "torch.vtensor.literal"() <{value = dense<0> : tensor<si64>}> : () -> !torch.vtensor<[],si64>
    %1452 = "torch.vtensor.literal"() <{value = dense<1> : tensor<si64>}> : () -> !torch.vtensor<[],si64>
    %1453 = "util.global.load"() <{global = @__auto.blk.31.attn_scale}> : () -> tensor<f32>
    %1454 = "torch_c.from_builtin_tensor"(%1453) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %1455 = "util.global.load"() <{global = @"__auto.blk.31.attn_output.q_input:rscale"}> : () -> tensor<f32>
    %1456 = "torch_c.from_builtin_tensor"(%1455) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %1457 = "util.global.load"() <{global = @"__auto.blk.31.attn_output.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
    %1458 = "torch_c.from_builtin_tensor"(%1457) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
    %1459 = "util.global.load"() <{global = @__auto.blk.31.ffn_norm.weight}> : () -> tensor<4096xbf16>
    %1460 = "torch_c.from_builtin_tensor"(%1459) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
    %1461 = "util.global.load"() <{global = @"__auto.blk.31.ffn_gate.q_input:rscale"}> : () -> tensor<f32>
    %1462 = "torch_c.from_builtin_tensor"(%1461) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %1463 = "util.global.load"() <{global = @"__auto.blk.31.ffn_gate.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
    %1464 = "torch_c.from_builtin_tensor"(%1463) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
    %1465 = "util.global.load"() <{global = @"__auto.blk.31.ffn_up.q_input:rscale"}> : () -> tensor<f32>
    %1466 = "torch_c.from_builtin_tensor"(%1465) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %1467 = "util.global.load"() <{global = @"__auto.blk.31.ffn_up.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
    %1468 = "torch_c.from_builtin_tensor"(%1467) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
    %1469 = "util.global.load"() <{global = @"__auto.blk.31.ffn_down.q_input:rscale"}> : () -> tensor<f32>
    %1470 = "torch_c.from_builtin_tensor"(%1469) : (tensor<f32>) -> !torch.vtensor<[],f32>
    %1471 = "util.global.load"() <{global = @"__auto.blk.31.ffn_down.weight:qs"}> : () -> tensor<4096x14336xf8E4M3FNUZ>
    %1472 = "torch_c.from_builtin_tensor"(%1471) : (tensor<4096x14336xf8E4M3FNUZ>) -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
    %1473 = "util.global.load"() <{global = @__auto.output_norm.weight}> : () -> tensor<4096xbf16>
    %1474 = "torch_c.from_builtin_tensor"(%1473) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
    %1475 = "util.global.load"() <{global = @__auto.output.weight}> : () -> tensor<128256x4096xbf16>
    %1476 = "torch_c.from_builtin_tensor"(%1475) : (tensor<128256x4096xbf16>) -> !torch.vtensor<[128256,4096],bf16>
    %1477 = "torch.copy.to_vtensor"(%arg66) : (!torch.tensor<[?,2097152],f8E4M3FNUZ>) -> !torch.vtensor<[?,2097152],f8E4M3FNUZ>
    %1478 = "torch.symbolic_int"() <{max_val = 4095 : i64, min_val = 2 : i64, symbol_name = "s0"}> : () -> !torch.int
    %1479 = "torch.symbolic_int"() <{max_val = 9223372036854775807 : i64, min_val = 0 : i64, symbol_name = "s1"}> : () -> !torch.int
    "torch.bind_symbolic_shape"(%arg65, %1478) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    "torch.bind_symbolic_shape"(%1477, %1479) <{shape_expressions = #map2}> : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> ()
    %1480 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %1481 = "torch.aten.size.int"(%arg65, %1480) : (!torch.vtensor<[4,?],si64>, !torch.int) -> !torch.int
    %1482 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %1483 = "torch.aten.size.int"(%1477, %1482) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> !torch.int
    %1484 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %1485 = "torch.aten.mul.int"(%1481, %1484) : (!torch.int, !torch.int) -> !torch.int
    %1486 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %1487 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %1488 = "torch.constant.none"() : () -> !torch.none
    %1489 = "torch.constant.none"() : () -> !torch.none
    %1490 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %1491 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %1492 = "torch.aten.arange.start_step"(%1486, %1485, %1487, %1488, %1489, %1490, %1491) : (!torch.int, !torch.int, !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[?],si64>
    "torch.bind_symbolic_shape"(%1492, %1478) <{shape_expressions = #map3}> : (!torch.vtensor<[?],si64>, !torch.int) -> ()
    %1493 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %1494 = "torch.aten.unsqueeze"(%arg63, %1493) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %1495 = "torch.aten.ge.Tensor"(%1492, %1494) : (!torch.vtensor<[?],si64>, !torch.vtensor<[4,1],si64>) -> !torch.vtensor<[4,?],i1>
    "torch.bind_symbolic_shape"(%1495, %1478) <{shape_expressions = #map}> : (!torch.vtensor<[4,?],i1>, !torch.int) -> ()
    %1496 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %1497 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %1498 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %1499 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %1500 = "torch.constant.none"() : () -> !torch.none
    %1501 = "torch.aten.scalar_tensor"(%1496, %1497, %1498, %1499, %1500) : (!torch.int, !torch.int, !torch.int, !torch.Device, !torch.none) -> !torch.vtensor<[],f32>
    %1502 = "torch.constant.float"() <{value = 0xFFF0000000000000 : f64}> : () -> !torch.float
    %1503 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %1504 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %1505 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %1506 = "torch.constant.none"() : () -> !torch.none
    %1507 = "torch.aten.scalar_tensor"(%1502, %1503, %1504, %1505, %1506) : (!torch.float, !torch.int, !torch.int, !torch.Device, !torch.none) -> !torch.vtensor<[],f32>
    %1508 = "torch.aten.where.self"(%1495, %1507, %1501) : (!torch.vtensor<[4,?],i1>, !torch.vtensor<[],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?],f32>
    "torch.bind_symbolic_shape"(%1508, %1478) <{shape_expressions = #map}> : (!torch.vtensor<[4,?],f32>, !torch.int) -> ()
    %1509 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %1510 = "torch.prims.convert_element_type"(%1508, %1509) : (!torch.vtensor<[4,?],f32>, !torch.int) -> !torch.vtensor<[4,?],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%1510, %1478) <{shape_expressions = #map}> : (!torch.vtensor<[4,?],f8E4M3FNUZ>, !torch.int) -> ()
    %1511 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %1512 = "torch.aten.unsqueeze"(%1510, %1511) : (!torch.vtensor<[4,?],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,1,?],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%1512, %1478) <{shape_expressions = #map6}> : (!torch.vtensor<[4,1,?],f8E4M3FNUZ>, !torch.int) -> ()
    %1513 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %1514 = "torch.aten.unsqueeze"(%1512, %1513) : (!torch.vtensor<[4,1,?],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,1,1,?],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%1514, %1478) <{shape_expressions = #map7}> : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>, !torch.int) -> ()
    %1515 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %1516 = "torch.prims.convert_element_type"(%1514, %1515) : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,1,1,?],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%1516, %1478) <{shape_expressions = #map7}> : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>, !torch.int) -> ()
    %1517 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %1518 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %1519 = "torch.constant.none"() : () -> !torch.none
    %1520 = "torch.constant.none"() : () -> !torch.none
    %1521 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %1522 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %1523 = "torch.aten.arange.start"(%1517, %1518, %1519, %1520, %1521, %1522) : (!torch.int, !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[1],si64>
    %1524 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %1525 = "torch.aten.unsqueeze"(%1523, %1524) : (!torch.vtensor<[1],si64>, !torch.int) -> !torch.vtensor<[1,1],si64>
    %1526 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %1527 = "torch.aten.unsqueeze"(%arg64, %1526) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %1528 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %1529 = "torch.aten.add.Tensor"(%1525, %1527, %1528) : (!torch.vtensor<[1,1],si64>, !torch.vtensor<[4,1],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %1530 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
    %1531 = "torch.constant.none"() : () -> !torch.none
    %1532 = "torch.constant.none"() : () -> !torch.none
    %1533 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %1534 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %1535 = "torch.aten.arange"(%1530, %1531, %1532, %1533, %1534) : (!torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[131072],si64>
    %1536 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %1537 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %1538 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %1539 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %1540 = "torch.constant.none"() : () -> !torch.none
    %1541 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %1542 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %1543 = "torch.aten.arange.start_step"(%1536, %1537, %1538, %1539, %1540, %1541, %1542) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[64],si64>
    %1544 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %1545 = "torch.prims.convert_element_type"(%1543, %1544) : (!torch.vtensor<[64],si64>, !torch.int) -> !torch.vtensor<[64],f32>
    %1546 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %1547 = "torch.aten.div.Scalar"(%1545, %1546) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %1548 = "torch.constant.float"() <{value = 5.000000e+05 : f64}> : () -> !torch.float
    %1549 = "torch.aten.pow.Scalar"(%1548, %1547) : (!torch.float, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %1550 = "torch.aten.reciprocal"(%1549) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %1551 = "torch.constant.float"() <{value = 1.000000e+00 : f64}> : () -> !torch.float
    %1552 = "torch.aten.mul.Scalar"(%1550, %1551) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
    %1553 = "torch.aten.reciprocal"(%1552) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %1554 = "torch.constant.float"() <{value = 6.2831853071795862 : f64}> : () -> !torch.float
    %1555 = "torch.aten.mul.Scalar"(%1553, %1554) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
    %1556 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
    %1557 = "torch.aten.gt.Scalar"(%1555, %1556) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %1558 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %1559 = "torch.aten.div.Scalar"(%1552, %1558) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %1560 = "torch.aten.where.self"(%1557, %1559, %1552) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %1561 = "torch.aten.reciprocal"(%1555) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %1562 = "torch.constant.int"() <{value = 8192 : i64}> : () -> !torch.int
    %1563 = "torch.aten.mul.Scalar"(%1561, %1562) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %1564 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %1565 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %1566 = "torch.aten.sub.Scalar"(%1563, %1564, %1565) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
    %1567 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %1568 = "torch.aten.div.Scalar"(%1566, %1567) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %1569 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %1570 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %1571 = "torch.aten.rsub.Scalar"(%1568, %1569, %1570) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
    %1572 = "torch.aten.mul.Tensor"(%1571, %1560) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %1573 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %1574 = "torch.aten.div.Scalar"(%1572, %1573) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %1575 = "torch.aten.mul.Tensor"(%1568, %1560) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %1576 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %1577 = "torch.aten.add.Tensor"(%1574, %1575, %1576) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
    %1578 = "torch.constant.float"() <{value = 2.048000e+03 : f64}> : () -> !torch.float
    %1579 = "torch.aten.lt.Scalar"(%1555, %1578) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %1580 = "torch.aten.bitwise_not"(%1579) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %1581 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
    %1582 = "torch.aten.gt.Scalar"(%1555, %1581) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
    %1583 = "torch.aten.bitwise_not"(%1582) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %1584 = "torch.aten.mul.Tensor"(%1580, %1583) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
    %1585 = "torch.aten.where.self"(%1584, %1577, %1560) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
    %1586 = "torch.prim.ListConstruct"(%1585, %1585) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.list<vtensor>
    %1587 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %1588 = "torch.aten.cat"(%1586, %1587) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[128],f32>
    %1589 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %1590 = "torch.prims.convert_element_type"(%1535, %1589) : (!torch.vtensor<[131072],si64>, !torch.int) -> !torch.vtensor<[131072],f32>
    %1591 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %1592 = "torch.prims.convert_element_type"(%1588, %1591) : (!torch.vtensor<[128],f32>, !torch.int) -> !torch.vtensor<[128],f32>
    %1593 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
    %1594 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %1595 = "torch.prim.ListConstruct"(%1593, %1594) : (!torch.int, !torch.int) -> !torch.list<int>
    %1596 = "torch.aten.view"(%1590, %1595) : (!torch.vtensor<[131072],f32>, !torch.list<int>) -> !torch.vtensor<[131072,1],f32>
    %1597 = "torch.aten.mul.Tensor"(%1596, %1592) : (!torch.vtensor<[131072,1],f32>, !torch.vtensor<[128],f32>) -> !torch.vtensor<[131072,128],f32>
    %1598 = "torch.aten.cos"(%1597) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
    %1599 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %1600 = "torch.prims.convert_element_type"(%1598, %1599) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
    %1601 = "torch.aten.sin"(%1597) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
    %1602 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %1603 = "torch.prims.convert_element_type"(%1601, %1602) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
    %1604 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %1605 = "torch.prim.ListConstruct"(%1604) : (!torch.int) -> !torch.list<int>
    %1606 = "torch.aten.view"(%1529, %1605) : (!torch.vtensor<[4,1],si64>, !torch.list<int>) -> !torch.vtensor<[4],si64>
    %1607 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %1608 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %1609 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %1610 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %1611 = "torch.aten.slice.Tensor"(%1600, %1607, %1608, %1609, %1610) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[131072,128],bf16>
    %1612 = "torch.prim.ListConstruct"(%1606) : (!torch.vtensor<[4],si64>) -> !torch.list<optional<vtensor>>
    %1613 = "torch.aten.index.Tensor"(%1611, %1612) : (!torch.vtensor<[131072,128],bf16>, !torch.list<optional<vtensor>>) -> !torch.vtensor<[4,128],bf16>
    %1614 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %1615 = "torch.prim.ListConstruct"(%1614) : (!torch.int) -> !torch.list<int>
    %1616 = "torch.aten.view"(%1529, %1615) : (!torch.vtensor<[4,1],si64>, !torch.list<int>) -> !torch.vtensor<[4],si64>
    %1617 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %1618 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %1619 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %1620 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %1621 = "torch.aten.slice.Tensor"(%1603, %1617, %1618, %1619, %1620) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[131072,128],bf16>
    %1622 = "torch.prim.ListConstruct"(%1616) : (!torch.vtensor<[4],si64>) -> !torch.list<optional<vtensor>>
    %1623 = "torch.aten.index.Tensor"(%1621, %1622) : (!torch.vtensor<[131072,128],bf16>, !torch.list<optional<vtensor>>) -> !torch.vtensor<[4,128],bf16>
    %1624 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %1625 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %1626 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %1627 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %1628 = "torch.aten.slice.Tensor"(%1613, %1624, %1625, %1626, %1627) : (!torch.vtensor<[4,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,128],bf16>
    %1629 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %1630 = "torch.aten.unsqueeze"(%1628, %1629) : (!torch.vtensor<[4,128],bf16>, !torch.int) -> !torch.vtensor<[4,1,128],bf16>
    %1631 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %1632 = "torch.aten.unsqueeze"(%1630, %1631) : (!torch.vtensor<[4,1,128],bf16>, !torch.int) -> !torch.vtensor<[4,1,1,128],bf16>
    %1633 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %1634 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %1635 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %1636 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %1637 = "torch.aten.slice.Tensor"(%1632, %1633, %1634, %1635, %1636) : (!torch.vtensor<[4,1,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,1,1,128],bf16>
    %1638 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %1639 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %1640 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %1641 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %1642 = "torch.aten.slice.Tensor"(%1623, %1638, %1639, %1640, %1641) : (!torch.vtensor<[4,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,128],bf16>
    %1643 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %1644 = "torch.aten.unsqueeze"(%1642, %1643) : (!torch.vtensor<[4,128],bf16>, !torch.int) -> !torch.vtensor<[4,1,128],bf16>
    %1645 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %1646 = "torch.aten.unsqueeze"(%1644, %1645) : (!torch.vtensor<[4,1,128],bf16>, !torch.int) -> !torch.vtensor<[4,1,1,128],bf16>
    %1647 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %1648 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %1649 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %1650 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %1651 = "torch.aten.slice.Tensor"(%1646, %1647, %1648, %1649, %1650) : (!torch.vtensor<[4,1,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,1,1,128],bf16>
    %1652 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %1653 = "torch.prims.convert_element_type"(%128, %1652) : (!torch.vtensor<[128256,4096],bf16>, !torch.int) -> !torch.vtensor<[128256,4096],bf16>
    %1654 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %1655 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %1656 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %1657 = "torch.aten.embedding"(%1653, %arg62, %1654, %1655, %1656) : (!torch.vtensor<[128256,4096],bf16>, !torch.vtensor<[4,1],si64>, !torch.int, !torch.bool, !torch.bool) -> !torch.vtensor<[4,1,4096],bf16>
    %1658 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %1659 = "torch.prims.convert_element_type"(%1657, %1658) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f32>
    %1660 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %1661 = "torch.aten.pow.Tensor_Scalar"(%1659, %1660) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],f32>
    %1662 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %1663 = "torch.prim.ListConstruct"(%1662) : (!torch.int) -> !torch.list<int>
    %1664 = "torch.constant.bool"() <{value = true}> : () -> !torch.bool
    %1665 = "torch.constant.none"() : () -> !torch.none
    %1666 = "torch.aten.mean.dim"(%1661, %1663, %1664, %1665) : (!torch.vtensor<[4,1,4096],f32>, !torch.list<int>, !torch.bool, !torch.none) -> !torch.vtensor<[4,1,1],f32>
    %1667 = "torch.constant.float"() <{value = 1.000000e-05 : f64}> : () -> !torch.float
    %1668 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %1669 = "torch.aten.add.Scalar"(%1666, %1667, %1668) : (!torch.vtensor<[4,1,1],f32>, !torch.float, !torch.int) -> !torch.vtensor<[4,1,1],f32>
    %1670 = "torch.aten.rsqrt"(%1669) : (!torch.vtensor<[4,1,1],f32>) -> !torch.vtensor<[4,1,1],f32>
    %1671 = "torch.aten.mul.Tensor"(%1659, %1670) : (!torch.vtensor<[4,1,4096],f32>, !torch.vtensor<[4,1,1],f32>) -> !torch.vtensor<[4,1,4096],f32>
    %1672 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %1673 = "torch.prims.convert_element_type"(%1671, %1672) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %1674 = "torch.aten.mul.Tensor"(%130, %1673) : (!torch.vtensor<[4096],bf16>, !torch.vtensor<[4,1,4096],bf16>) -> !torch.vtensor<[4,1,4096],bf16>
    %1675 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %1676 = "torch.prims.convert_element_type"(%1674, %1675) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %1677 = "torch.aten.div.Tensor"(%1676, %132) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %1678 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %1679 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %1680 = "torch.aten.clamp"(%1677, %1678, %1679) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %1681 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %1682 = "torch.prims.convert_element_type"(%1680, %1681) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %1683 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %1684 = "torch.aten.unsqueeze"(%134, %1683) : (!torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>
    %1685 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %1686 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %1687 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %1688 = "torch.prim.ListConstruct"(%1685, %1686, %1687) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1689 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %1690 = "torch.aten.expand"(%1684, %1688, %1689) : (!torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ>
    %1691 = "torch_c.to_builtin_tensor"(%1682) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %1692 = "torch_c.to_builtin_tensor"(%1690) : (!torch.vtensor<[4,4096,4096],f8E4M3FNUZ>) -> tensor<4x4096x4096xf8E4M3FNUZ>
    %1693 = "util.call"(%1691, %1692) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x1x4096xf32>
    %1694 = "torch_c.from_builtin_tensor"(%1693) : (tensor<4x1x4096xf32>) -> !torch.vtensor<[4,1,4096],f32>
    %1695 = "torch.aten.div.Tensor"(%1694, %136) : (!torch.vtensor<[4,1,4096],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],f32>
    %1696 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %1697 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %1698 = "torch.aten.clamp"(%1695, %1696, %1697) : (!torch.vtensor<[4,1,4096],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],f32>
    %1699 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %1700 = "torch.prims.convert_element_type"(%1698, %1699) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %1701 = "torch.aten.div.Tensor"(%1676, %138) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %1702 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %1703 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %1704 = "torch.aten.clamp"(%1701, %1702, %1703) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %1705 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %1706 = "torch.prims.convert_element_type"(%1704, %1705) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %1707 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %1708 = "torch.aten.unsqueeze"(%140, %1707) : (!torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>
    %1709 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %1710 = "torch.constant.int"() <{value = 1024 : i64}> : () -> !torch.int
    %1711 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %1712 = "torch.prim.ListConstruct"(%1709, %1710, %1711) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1713 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %1714 = "torch.aten.expand"(%1708, %1712, %1713) : (!torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ>
    %1715 = "torch_c.to_builtin_tensor"(%1706) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %1716 = "torch_c.to_builtin_tensor"(%1714) : (!torch.vtensor<[4,1024,4096],f8E4M3FNUZ>) -> tensor<4x1024x4096xf8E4M3FNUZ>
    %1717 = "util.call"(%1715, %1716) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x1x1024xf32>
    %1718 = "torch_c.from_builtin_tensor"(%1717) : (tensor<4x1x1024xf32>) -> !torch.vtensor<[4,1,1024],f32>
    %1719 = "torch.aten.div.Tensor"(%1718, %142) : (!torch.vtensor<[4,1,1024],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,1024],f32>
    %1720 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %1721 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %1722 = "torch.aten.clamp"(%1719, %1720, %1721) : (!torch.vtensor<[4,1,1024],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,1024],f32>
    %1723 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %1724 = "torch.prims.convert_element_type"(%1722, %1723) : (!torch.vtensor<[4,1,1024],f32>, !torch.int) -> !torch.vtensor<[4,1,1024],f8E4M3FNUZ>
    %1725 = "torch.aten.div.Tensor"(%1676, %144) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %1726 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %1727 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %1728 = "torch.aten.clamp"(%1725, %1726, %1727) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %1729 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %1730 = "torch.prims.convert_element_type"(%1728, %1729) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %1731 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %1732 = "torch.aten.unsqueeze"(%146, %1731) : (!torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>
    %1733 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %1734 = "torch.constant.int"() <{value = 1024 : i64}> : () -> !torch.int
    %1735 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %1736 = "torch.prim.ListConstruct"(%1733, %1734, %1735) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1737 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %1738 = "torch.aten.expand"(%1732, %1736, %1737) : (!torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ>
    %1739 = "torch_c.to_builtin_tensor"(%1730) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %1740 = "torch_c.to_builtin_tensor"(%1738) : (!torch.vtensor<[4,1024,4096],f8E4M3FNUZ>) -> tensor<4x1024x4096xf8E4M3FNUZ>
    %1741 = "util.call"(%1739, %1740) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x1x1024xf32>
    %1742 = "torch_c.from_builtin_tensor"(%1741) : (tensor<4x1x1024xf32>) -> !torch.vtensor<[4,1,1024],f32>
    %1743 = "torch.aten.div.Tensor"(%1742, %148) : (!torch.vtensor<[4,1,1024],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,1024],f32>
    %1744 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %1745 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %1746 = "torch.aten.clamp"(%1743, %1744, %1745) : (!torch.vtensor<[4,1,1024],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,1024],f32>
    %1747 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %1748 = "torch.prims.convert_element_type"(%1746, %1747) : (!torch.vtensor<[4,1,1024],f32>, !torch.int) -> !torch.vtensor<[4,1,1024],f8E4M3FNUZ>
    %1749 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %1750 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %1751 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %1752 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %1753 = "torch.prim.ListConstruct"(%1749, %1750, %1751, %1752) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1754 = "torch.aten.view"(%1700, %1753) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,1,32,128],f8E4M3FNUZ>
    %1755 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %1756 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %1757 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %1758 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %1759 = "torch.prim.ListConstruct"(%1755, %1756, %1757, %1758) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1760 = "torch.aten.view"(%1724, %1759) : (!torch.vtensor<[4,1,1024],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>
    %1761 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %1762 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %1763 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %1764 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %1765 = "torch.prim.ListConstruct"(%1761, %1762, %1763, %1764) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1766 = "torch.aten.view"(%1748, %1765) : (!torch.vtensor<[4,1,1024],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>
    %1767 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %1768 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %1769 = "torch.aten.transpose.int"(%1754, %1767, %1768) : (!torch.vtensor<[4,1,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %1770 = "torch.aten.mul.Tensor"(%1769, %1637) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,1,1,128],bf16>) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %1771 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %1772 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %1773 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %1774 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %1775 = "torch.aten.slice.Tensor"(%1769, %1771, %1772, %1773, %1774) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,32,1,64],f8E4M3FNUZ>
    %1776 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %1777 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %1778 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %1779 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %1780 = "torch.aten.slice.Tensor"(%1769, %1776, %1777, %1778, %1779) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,32,1,64],f8E4M3FNUZ>
    %1781 = "torch.aten.neg"(%1780) : (!torch.vtensor<[4,32,1,64],f8E4M3FNUZ>) -> !torch.vtensor<[4,32,1,64],f8E4M3FNUZ>
    %1782 = "torch.prim.ListConstruct"(%1781, %1775) : (!torch.vtensor<[4,32,1,64],f8E4M3FNUZ>, !torch.vtensor<[4,32,1,64],f8E4M3FNUZ>) -> !torch.list<vtensor>
    %1783 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %1784 = "torch.aten.cat"(%1782, %1783) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %1785 = "torch.aten.mul.Tensor"(%1784, %1651) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,1,1,128],bf16>) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %1786 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %1787 = "torch.aten.add.Tensor"(%1770, %1785, %1786) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %1788 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %1789 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %1790 = "torch.aten.transpose.int"(%1787, %1788, %1789) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,1,32,128],f8E4M3FNUZ>
    %1791 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %1792 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %1793 = "torch.aten.transpose.int"(%1760, %1791, %1792) : (!torch.vtensor<[4,1,8,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %1794 = "torch.aten.mul.Tensor"(%1793, %1637) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,1,1,128],bf16>) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %1795 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %1796 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %1797 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %1798 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %1799 = "torch.aten.slice.Tensor"(%1793, %1795, %1796, %1797, %1798) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,8,1,64],f8E4M3FNUZ>
    %1800 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %1801 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %1802 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %1803 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %1804 = "torch.aten.slice.Tensor"(%1793, %1800, %1801, %1802, %1803) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,8,1,64],f8E4M3FNUZ>
    %1805 = "torch.aten.neg"(%1804) : (!torch.vtensor<[4,8,1,64],f8E4M3FNUZ>) -> !torch.vtensor<[4,8,1,64],f8E4M3FNUZ>
    %1806 = "torch.prim.ListConstruct"(%1805, %1799) : (!torch.vtensor<[4,8,1,64],f8E4M3FNUZ>, !torch.vtensor<[4,8,1,64],f8E4M3FNUZ>) -> !torch.list<vtensor>
    %1807 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %1808 = "torch.aten.cat"(%1806, %1807) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %1809 = "torch.aten.mul.Tensor"(%1808, %1651) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,1,1,128],bf16>) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %1810 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %1811 = "torch.aten.add.Tensor"(%1794, %1809, %1810) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %1812 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %1813 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %1814 = "torch.aten.transpose.int"(%1811, %1812, %1813) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>
    %1815 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %1816 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %1817 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %1818 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %1819 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %1820 = "torch.prim.ListConstruct"(%1483, %1815, %1816, %1817, %1818, %1819) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1821 = "torch.aten.view"(%1477, %1820) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%1821, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %1822 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %1823 = "torch.aten.floor_divide.Scalar"(%arg64, %1822) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4],si64>
    %1824 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %1825 = "torch.aten.unsqueeze"(%1823, %1824) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %1826 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %1827 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %1828 = "torch.aten.gather"(%arg65, %1826, %1825, %1827) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.vtensor<[4,1],si64>, !torch.bool) -> !torch.vtensor<[4,1],si64>
    %1829 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %1830 = "torch.aten.remainder.Scalar"(%arg64, %1829) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4],si64>
    %1831 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %1832 = "torch.aten.unsqueeze"(%1830, %1831) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %1833 = "torch.constant.none"() : () -> !torch.none
    %1834 = "torch.aten.clone"(%149, %1833) : (!torch.vtensor<[],si64>, !torch.none) -> !torch.vtensor<[],si64>
    %1835 = "torch.aten.detach"(%1834) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %1836 = "torch.aten.detach"(%1835) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %1837 = "torch.aten.detach"(%1836) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %1838 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %1839 = "torch.aten.unsqueeze"(%1837, %1838) : (!torch.vtensor<[],si64>, !torch.int) -> !torch.vtensor<[1],si64>
    %1840 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %1841 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %1842 = "torch.prim.ListConstruct"(%1840, %1841) : (!torch.int, !torch.int) -> !torch.list<int>
    %1843 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %1844 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %1845 = "torch.prim.ListConstruct"(%1843, %1844) : (!torch.int, !torch.int) -> !torch.list<int>
    %1846 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %1847 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %1848 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %1849 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %1850 = "torch.aten.empty_strided"(%1842, %1845, %1846, %1847, %1848, %1849) : (!torch.list<int>, !torch.list<int>, !torch.int, !torch.int, !torch.Device, !torch.bool) -> !torch.vtensor<[4,1],si64>
    %1851 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %1852 = "torch.aten.fill.Scalar"(%1850, %1851) : (!torch.vtensor<[4,1],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %1853 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %1854 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %1855 = "torch.prim.ListConstruct"(%1853, %1854) : (!torch.int, !torch.int) -> !torch.list<int>
    %1856 = "torch.aten.repeat"(%1839, %1855) : (!torch.vtensor<[1],si64>, !torch.list<int>) -> !torch.vtensor<[4,1],si64>
    %1857 = "torch.prim.ListConstruct"(%1828, %1852, %1856, %1832) : (!torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>) -> !torch.list<optional<vtensor>>
    %1858 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %1859 = "torch.aten.index_put"(%1821, %1857, %1814, %1858) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>, !torch.bool) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%1859, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %1860 = "torch.constant.int"() <{value = 2097152 : i64}> : () -> !torch.int
    %1861 = "torch.prim.ListConstruct"(%1483, %1860) : (!torch.int, !torch.int) -> !torch.list<int>
    %1862 = "torch.aten.view"(%1859, %1861) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,2097152],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%1862, %1479) <{shape_expressions = #map2}> : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> ()
    %1863 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %1864 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %1865 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %1866 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %1867 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %1868 = "torch.prim.ListConstruct"(%1483, %1863, %1864, %1865, %1866, %1867) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1869 = "torch.aten.view"(%1862, %1868) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%1869, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %1870 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %1871 = "torch.aten.floor_divide.Scalar"(%arg64, %1870) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4],si64>
    %1872 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %1873 = "torch.aten.unsqueeze"(%1871, %1872) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %1874 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %1875 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %1876 = "torch.aten.gather"(%arg65, %1874, %1873, %1875) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.vtensor<[4,1],si64>, !torch.bool) -> !torch.vtensor<[4,1],si64>
    %1877 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %1878 = "torch.aten.remainder.Scalar"(%arg64, %1877) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4],si64>
    %1879 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %1880 = "torch.aten.unsqueeze"(%1878, %1879) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %1881 = "torch.constant.none"() : () -> !torch.none
    %1882 = "torch.aten.clone"(%150, %1881) : (!torch.vtensor<[],si64>, !torch.none) -> !torch.vtensor<[],si64>
    %1883 = "torch.aten.detach"(%1882) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %1884 = "torch.aten.detach"(%1883) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %1885 = "torch.aten.detach"(%1884) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %1886 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %1887 = "torch.aten.unsqueeze"(%1885, %1886) : (!torch.vtensor<[],si64>, !torch.int) -> !torch.vtensor<[1],si64>
    %1888 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %1889 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %1890 = "torch.prim.ListConstruct"(%1888, %1889) : (!torch.int, !torch.int) -> !torch.list<int>
    %1891 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %1892 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %1893 = "torch.prim.ListConstruct"(%1891, %1892) : (!torch.int, !torch.int) -> !torch.list<int>
    %1894 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %1895 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %1896 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %1897 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %1898 = "torch.aten.empty_strided"(%1890, %1893, %1894, %1895, %1896, %1897) : (!torch.list<int>, !torch.list<int>, !torch.int, !torch.int, !torch.Device, !torch.bool) -> !torch.vtensor<[4,1],si64>
    %1899 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %1900 = "torch.aten.fill.Scalar"(%1898, %1899) : (!torch.vtensor<[4,1],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %1901 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %1902 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %1903 = "torch.prim.ListConstruct"(%1901, %1902) : (!torch.int, !torch.int) -> !torch.list<int>
    %1904 = "torch.aten.repeat"(%1887, %1903) : (!torch.vtensor<[1],si64>, !torch.list<int>) -> !torch.vtensor<[4,1],si64>
    %1905 = "torch.prim.ListConstruct"(%1876, %1900, %1904, %1880) : (!torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>) -> !torch.list<optional<vtensor>>
    %1906 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %1907 = "torch.aten.index_put"(%1869, %1905, %1766, %1906) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>, !torch.bool) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%1907, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %1908 = "torch.constant.int"() <{value = 2097152 : i64}> : () -> !torch.int
    %1909 = "torch.prim.ListConstruct"(%1483, %1908) : (!torch.int, !torch.int) -> !torch.list<int>
    %1910 = "torch.aten.view"(%1907, %1909) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,2097152],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%1910, %1479) <{shape_expressions = #map2}> : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> ()
    %1911 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %1912 = "torch.aten.mul.int"(%1483, %1911) : (!torch.int, !torch.int) -> !torch.int
    %1913 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %1914 = "torch.aten.mul.int"(%1912, %1913) : (!torch.int, !torch.int) -> !torch.int
    %1915 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %1916 = "torch.aten.mul.Scalar"(%arg65, %1915) : (!torch.vtensor<[4,?],si64>, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%1916, %1478) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %1917 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %1918 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %1919 = "torch.aten.add.Scalar"(%1916, %1917, %1918) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%1919, %1478) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %1920 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %1921 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %1922 = "torch.aten.add.Scalar"(%1919, %1920, %1921) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%1922, %1478) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %1923 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %1924 = "torch.aten.mul.int"(%1923, %1481) : (!torch.int, !torch.int) -> !torch.int
    %1925 = "torch.prim.ListConstruct"(%1924) : (!torch.int) -> !torch.list<int>
    %1926 = "torch.aten.view"(%1922, %1925) : (!torch.vtensor<[4,?],si64>, !torch.list<int>) -> !torch.vtensor<[?],si64>
    "torch.bind_symbolic_shape"(%1926, %1478) <{shape_expressions = #map24}> : (!torch.vtensor<[?],si64>, !torch.int) -> ()
    %1927 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %1928 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %1929 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %1930 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %1931 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %1932 = "torch.prim.ListConstruct"(%1483, %1927, %1928, %1929, %1930, %1931) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1933 = "torch.aten.view"(%1910, %1932) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%1933, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %1934 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %1935 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %1936 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %1937 = "torch.prim.ListConstruct"(%1914, %1934, %1935, %1936) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1938 = "torch.aten.view"(%1933, %1937) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%1938, %1479) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %1939 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %1940 = "torch.aten.index_select"(%1938, %1939, %1926) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int, !torch.vtensor<[?],si64>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%1940, %1478) <{shape_expressions = #map23}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %1941 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %1942 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %1943 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %1944 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %1945 = "torch.prim.ListConstruct"(%1941, %1481, %1942, %1943, %1944) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1946 = "torch.aten.view"(%1940, %1945) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%1946, %1478) <{shape_expressions = #map22}> : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %1947 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %1948 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %1949 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %1950 = "torch.prim.ListConstruct"(%1947, %1485, %1948, %1949) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1951 = "torch.aten.view"(%1946, %1950) : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%1951, %1478) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %1952 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %1953 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %1954 = "torch.aten.add.Scalar"(%1919, %1952, %1953) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%1954, %1478) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %1955 = "torch.prim.ListConstruct"(%1924) : (!torch.int) -> !torch.list<int>
    %1956 = "torch.aten.view"(%1954, %1955) : (!torch.vtensor<[4,?],si64>, !torch.list<int>) -> !torch.vtensor<[?],si64>
    "torch.bind_symbolic_shape"(%1956, %1478) <{shape_expressions = #map24}> : (!torch.vtensor<[?],si64>, !torch.int) -> ()
    %1957 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %1958 = "torch.aten.index_select"(%1938, %1957, %1956) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int, !torch.vtensor<[?],si64>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%1958, %1478) <{shape_expressions = #map23}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %1959 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %1960 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %1961 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %1962 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %1963 = "torch.prim.ListConstruct"(%1959, %1481, %1960, %1961, %1962) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1964 = "torch.aten.view"(%1958, %1963) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%1964, %1478) <{shape_expressions = #map22}> : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %1965 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %1966 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %1967 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %1968 = "torch.prim.ListConstruct"(%1965, %1485, %1966, %1967) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1969 = "torch.aten.view"(%1964, %1968) : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%1969, %1478) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %1970 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %1971 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %1972 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %1973 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %1974 = "torch.aten.slice.Tensor"(%1951, %1970, %1971, %1972, %1973) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%1974, %1478) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %1975 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %1976 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %1977 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %1978 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %1979 = "torch.aten.slice.Tensor"(%1969, %1975, %1976, %1977, %1978) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%1979, %1478) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %1980 = "torch.constant.int"() <{value = -2 : i64}> : () -> !torch.int
    %1981 = "torch.aten.unsqueeze"(%1974, %1980) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%1981, %1478) <{shape_expressions = #map25}> : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.int) -> ()
    %1982 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %1983 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %1984 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %1985 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %1986 = "torch.prim.ListConstruct"(%1982, %1485, %1983, %1984, %1985) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1987 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %1988 = "torch.aten.expand"(%1981, %1986, %1987) : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%1988, %1478) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %1989 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %1990 = "torch.aten.clone"(%1988, %1989) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%1990, %1478) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %1991 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %1992 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %1993 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %1994 = "torch.prim.ListConstruct"(%1991, %1485, %1992, %1993) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1995 = "torch.aten._unsafe_view"(%1990, %1994) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%1995, %1478) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %1996 = "torch.constant.int"() <{value = -2 : i64}> : () -> !torch.int
    %1997 = "torch.aten.unsqueeze"(%1979, %1996) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%1997, %1478) <{shape_expressions = #map25}> : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.int) -> ()
    %1998 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %1999 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %2000 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %2001 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %2002 = "torch.prim.ListConstruct"(%1998, %1485, %1999, %2000, %2001) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2003 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %2004 = "torch.aten.expand"(%1997, %2002, %2003) : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%2004, %1478) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %2005 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %2006 = "torch.aten.clone"(%2004, %2005) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%2006, %1478) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %2007 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %2008 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %2009 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %2010 = "torch.prim.ListConstruct"(%2007, %1485, %2008, %2009) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2011 = "torch.aten._unsafe_view"(%2006, %2010) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%2011, %1478) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %2012 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %2013 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %2014 = "torch.aten.transpose.int"(%1790, %2012, %2013) : (!torch.vtensor<[4,1,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %2015 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %2016 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %2017 = "torch.aten.transpose.int"(%1995, %2015, %2016) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%2017, %1478) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %2018 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %2019 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %2020 = "torch.aten.transpose.int"(%2011, %2018, %2019) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%2020, %1478) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %2021 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %2022 = "torch.aten.squeeze.dim"(%1516, %2021) : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,1,1,?],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%2022, %1478) <{shape_expressions = #map7}> : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>, !torch.int) -> ()
    %2023 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %2024 = "torch.aten.squeeze.dim"(%2022, %2023) : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,1,1,?],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%2024, %1478) <{shape_expressions = #map7}> : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>, !torch.int) -> ()
    %2025 = "torch_c.to_builtin_tensor"(%2014) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>) -> tensor<4x32x1x128xf8E4M3FNUZ>
    %2026 = "tensor.cast"(%2025) : (tensor<4x32x1x128xf8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %2027 = "torch_c.to_builtin_tensor"(%2017) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %2028 = "torch_c.to_builtin_tensor"(%2020) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %2029 = "torch_c.to_builtin_tensor"(%2024) : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>) -> tensor<4x1x1x?xf8E4M3FNUZ>
    %2030 = "tensor.cast"(%2029) : (tensor<4x1x1x?xf8E4M3FNUZ>) -> tensor<?x?x?x?xf8E4M3FNUZ>
    %2031 = "torch_c.to_builtin_tensor"(%152) : (!torch.vtensor<[],f32>) -> tensor<f32>
    %2032 = "util.call"(%2026, %2027, %2028, %2031, %2030) <{callee = @sharktank_masked_flash_attention_4_32_128_128_f8E4M3FNUZ_f32_f32}> : (tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<f32>, tensor<?x?x?x?xf8E4M3FNUZ>) -> tensor<4x32x?x128xf32>
    %2033 = "tensor.cast"(%2032) : (tensor<4x32x?x128xf32>) -> tensor<4x32x1x128xf32>
    %2034 = "torch_c.from_builtin_tensor"(%2033) : (tensor<4x32x1x128xf32>) -> !torch.vtensor<[4,32,1,128],f32>
    %2035 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %2036 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %2037 = "torch.aten.transpose.int"(%2034, %2035, %2036) : (!torch.vtensor<[4,32,1,128],f32>, !torch.int, !torch.int) -> !torch.vtensor<[4,1,32,128],f32>
    %2038 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %2039 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %2040 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %2041 = "torch.prim.ListConstruct"(%2038, %2039, %2040) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2042 = "torch.aten.view"(%2037, %2041) : (!torch.vtensor<[4,1,32,128],f32>, !torch.list<int>) -> !torch.vtensor<[4,1,4096],f32>
    %2043 = "torch.aten.div.Tensor"(%2042, %154) : (!torch.vtensor<[4,1,4096],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],f32>
    %2044 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %2045 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %2046 = "torch.aten.clamp"(%2043, %2044, %2045) : (!torch.vtensor<[4,1,4096],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],f32>
    %2047 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %2048 = "torch.prims.convert_element_type"(%2046, %2047) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %2049 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %2050 = "torch.aten.unsqueeze"(%156, %2049) : (!torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>
    %2051 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %2052 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %2053 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %2054 = "torch.prim.ListConstruct"(%2051, %2052, %2053) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2055 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %2056 = "torch.aten.expand"(%2050, %2054, %2055) : (!torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ>
    %2057 = "torch_c.to_builtin_tensor"(%2048) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %2058 = "torch_c.to_builtin_tensor"(%2056) : (!torch.vtensor<[4,4096,4096],f8E4M3FNUZ>) -> tensor<4x4096x4096xf8E4M3FNUZ>
    %2059 = "util.call"(%2057, %2058) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x1x4096xf32>
    %2060 = "torch_c.from_builtin_tensor"(%2059) : (tensor<4x1x4096xf32>) -> !torch.vtensor<[4,1,4096],f32>
    %2061 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %2062 = "torch.prims.convert_element_type"(%2060, %2061) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %2063 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %2064 = "torch.aten.add.Tensor"(%1657, %2062, %2063) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %2065 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %2066 = "torch.prims.convert_element_type"(%2064, %2065) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f32>
    %2067 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %2068 = "torch.aten.pow.Tensor_Scalar"(%2066, %2067) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],f32>
    %2069 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %2070 = "torch.prim.ListConstruct"(%2069) : (!torch.int) -> !torch.list<int>
    %2071 = "torch.constant.bool"() <{value = true}> : () -> !torch.bool
    %2072 = "torch.constant.none"() : () -> !torch.none
    %2073 = "torch.aten.mean.dim"(%2068, %2070, %2071, %2072) : (!torch.vtensor<[4,1,4096],f32>, !torch.list<int>, !torch.bool, !torch.none) -> !torch.vtensor<[4,1,1],f32>
    %2074 = "torch.constant.float"() <{value = 1.000000e-05 : f64}> : () -> !torch.float
    %2075 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %2076 = "torch.aten.add.Scalar"(%2073, %2074, %2075) : (!torch.vtensor<[4,1,1],f32>, !torch.float, !torch.int) -> !torch.vtensor<[4,1,1],f32>
    %2077 = "torch.aten.rsqrt"(%2076) : (!torch.vtensor<[4,1,1],f32>) -> !torch.vtensor<[4,1,1],f32>
    %2078 = "torch.aten.mul.Tensor"(%2066, %2077) : (!torch.vtensor<[4,1,4096],f32>, !torch.vtensor<[4,1,1],f32>) -> !torch.vtensor<[4,1,4096],f32>
    %2079 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %2080 = "torch.prims.convert_element_type"(%2078, %2079) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %2081 = "torch.aten.mul.Tensor"(%158, %2080) : (!torch.vtensor<[4096],bf16>, !torch.vtensor<[4,1,4096],bf16>) -> !torch.vtensor<[4,1,4096],bf16>
    %2082 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %2083 = "torch.prims.convert_element_type"(%2081, %2082) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %2084 = "torch.aten.div.Tensor"(%2083, %160) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %2085 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %2086 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %2087 = "torch.aten.clamp"(%2084, %2085, %2086) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %2088 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %2089 = "torch.prims.convert_element_type"(%2087, %2088) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %2090 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %2091 = "torch.aten.unsqueeze"(%162, %2090) : (!torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>
    %2092 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %2093 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %2094 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %2095 = "torch.prim.ListConstruct"(%2092, %2093, %2094) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2096 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %2097 = "torch.aten.expand"(%2091, %2095, %2096) : (!torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ>
    %2098 = "torch_c.to_builtin_tensor"(%2089) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %2099 = "torch_c.to_builtin_tensor"(%2097) : (!torch.vtensor<[4,14336,4096],f8E4M3FNUZ>) -> tensor<4x14336x4096xf8E4M3FNUZ>
    %2100 = "util.call"(%2098, %2099) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x1x14336xf32>
    %2101 = "torch_c.from_builtin_tensor"(%2100) : (tensor<4x1x14336xf32>) -> !torch.vtensor<[4,1,14336],f32>
    %2102 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %2103 = "torch.prims.convert_element_type"(%2101, %2102) : (!torch.vtensor<[4,1,14336],f32>, !torch.int) -> !torch.vtensor<[4,1,14336],bf16>
    %2104 = "torch.aten.silu"(%2103) : (!torch.vtensor<[4,1,14336],bf16>) -> !torch.vtensor<[4,1,14336],bf16>
    %2105 = "torch.aten.div.Tensor"(%2083, %164) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %2106 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %2107 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %2108 = "torch.aten.clamp"(%2105, %2106, %2107) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %2109 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %2110 = "torch.prims.convert_element_type"(%2108, %2109) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %2111 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %2112 = "torch.aten.unsqueeze"(%166, %2111) : (!torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>
    %2113 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %2114 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %2115 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %2116 = "torch.prim.ListConstruct"(%2113, %2114, %2115) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2117 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %2118 = "torch.aten.expand"(%2112, %2116, %2117) : (!torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ>
    %2119 = "torch_c.to_builtin_tensor"(%2110) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %2120 = "torch_c.to_builtin_tensor"(%2118) : (!torch.vtensor<[4,14336,4096],f8E4M3FNUZ>) -> tensor<4x14336x4096xf8E4M3FNUZ>
    %2121 = "util.call"(%2119, %2120) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x1x14336xf32>
    %2122 = "torch_c.from_builtin_tensor"(%2121) : (tensor<4x1x14336xf32>) -> !torch.vtensor<[4,1,14336],f32>
    %2123 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %2124 = "torch.prims.convert_element_type"(%2122, %2123) : (!torch.vtensor<[4,1,14336],f32>, !torch.int) -> !torch.vtensor<[4,1,14336],bf16>
    %2125 = "torch.aten.mul.Tensor"(%2104, %2124) : (!torch.vtensor<[4,1,14336],bf16>, !torch.vtensor<[4,1,14336],bf16>) -> !torch.vtensor<[4,1,14336],bf16>
    %2126 = "torch.aten.div.Tensor"(%2125, %168) : (!torch.vtensor<[4,1,14336],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,14336],bf16>
    %2127 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %2128 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %2129 = "torch.aten.clamp"(%2126, %2127, %2128) : (!torch.vtensor<[4,1,14336],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,14336],bf16>
    %2130 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %2131 = "torch.prims.convert_element_type"(%2129, %2130) : (!torch.vtensor<[4,1,14336],bf16>, !torch.int) -> !torch.vtensor<[4,1,14336],f8E4M3FNUZ>
    %2132 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %2133 = "torch.aten.unsqueeze"(%170, %2132) : (!torch.vtensor<[4096,14336],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,14336],f8E4M3FNUZ>
    %2134 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %2135 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %2136 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %2137 = "torch.prim.ListConstruct"(%2134, %2135, %2136) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2138 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %2139 = "torch.aten.expand"(%2133, %2137, %2138) : (!torch.vtensor<[1,4096,14336],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,14336],f8E4M3FNUZ>
    %2140 = "torch_c.to_builtin_tensor"(%2131) : (!torch.vtensor<[4,1,14336],f8E4M3FNUZ>) -> tensor<4x1x14336xf8E4M3FNUZ>
    %2141 = "torch_c.to_builtin_tensor"(%2139) : (!torch.vtensor<[4,4096,14336],f8E4M3FNUZ>) -> tensor<4x4096x14336xf8E4M3FNUZ>
    %2142 = "util.call"(%2140, %2141) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x14336xf8E4M3FNUZ_R4x4096x14336xf8E4M3FNUZ}> : (tensor<4x1x14336xf8E4M3FNUZ>, tensor<4x4096x14336xf8E4M3FNUZ>) -> tensor<4x1x4096xf32>
    %2143 = "torch_c.from_builtin_tensor"(%2142) : (tensor<4x1x4096xf32>) -> !torch.vtensor<[4,1,4096],f32>
    %2144 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %2145 = "torch.prims.convert_element_type"(%2143, %2144) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %2146 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %2147 = "torch.aten.add.Tensor"(%2064, %2145, %2146) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %2148 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %2149 = "torch.prims.convert_element_type"(%2147, %2148) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f32>
    %2150 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %2151 = "torch.aten.pow.Tensor_Scalar"(%2149, %2150) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],f32>
    %2152 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %2153 = "torch.prim.ListConstruct"(%2152) : (!torch.int) -> !torch.list<int>
    %2154 = "torch.constant.bool"() <{value = true}> : () -> !torch.bool
    %2155 = "torch.constant.none"() : () -> !torch.none
    %2156 = "torch.aten.mean.dim"(%2151, %2153, %2154, %2155) : (!torch.vtensor<[4,1,4096],f32>, !torch.list<int>, !torch.bool, !torch.none) -> !torch.vtensor<[4,1,1],f32>
    %2157 = "torch.constant.float"() <{value = 1.000000e-05 : f64}> : () -> !torch.float
    %2158 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %2159 = "torch.aten.add.Scalar"(%2156, %2157, %2158) : (!torch.vtensor<[4,1,1],f32>, !torch.float, !torch.int) -> !torch.vtensor<[4,1,1],f32>
    %2160 = "torch.aten.rsqrt"(%2159) : (!torch.vtensor<[4,1,1],f32>) -> !torch.vtensor<[4,1,1],f32>
    %2161 = "torch.aten.mul.Tensor"(%2149, %2160) : (!torch.vtensor<[4,1,4096],f32>, !torch.vtensor<[4,1,1],f32>) -> !torch.vtensor<[4,1,4096],f32>
    %2162 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %2163 = "torch.prims.convert_element_type"(%2161, %2162) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %2164 = "torch.aten.mul.Tensor"(%172, %2163) : (!torch.vtensor<[4096],bf16>, !torch.vtensor<[4,1,4096],bf16>) -> !torch.vtensor<[4,1,4096],bf16>
    %2165 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %2166 = "torch.prims.convert_element_type"(%2164, %2165) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %2167 = "torch.aten.div.Tensor"(%2166, %174) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %2168 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %2169 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %2170 = "torch.aten.clamp"(%2167, %2168, %2169) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %2171 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %2172 = "torch.prims.convert_element_type"(%2170, %2171) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %2173 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %2174 = "torch.aten.unsqueeze"(%176, %2173) : (!torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>
    %2175 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %2176 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %2177 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %2178 = "torch.prim.ListConstruct"(%2175, %2176, %2177) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2179 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %2180 = "torch.aten.expand"(%2174, %2178, %2179) : (!torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ>
    %2181 = "torch_c.to_builtin_tensor"(%2172) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %2182 = "torch_c.to_builtin_tensor"(%2180) : (!torch.vtensor<[4,4096,4096],f8E4M3FNUZ>) -> tensor<4x4096x4096xf8E4M3FNUZ>
    %2183 = "util.call"(%2181, %2182) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x1x4096xf32>
    %2184 = "torch_c.from_builtin_tensor"(%2183) : (tensor<4x1x4096xf32>) -> !torch.vtensor<[4,1,4096],f32>
    %2185 = "torch.aten.div.Tensor"(%2184, %178) : (!torch.vtensor<[4,1,4096],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],f32>
    %2186 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %2187 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %2188 = "torch.aten.clamp"(%2185, %2186, %2187) : (!torch.vtensor<[4,1,4096],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],f32>
    %2189 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %2190 = "torch.prims.convert_element_type"(%2188, %2189) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %2191 = "torch.aten.div.Tensor"(%2166, %180) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %2192 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %2193 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %2194 = "torch.aten.clamp"(%2191, %2192, %2193) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %2195 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %2196 = "torch.prims.convert_element_type"(%2194, %2195) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %2197 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %2198 = "torch.aten.unsqueeze"(%182, %2197) : (!torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>
    %2199 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %2200 = "torch.constant.int"() <{value = 1024 : i64}> : () -> !torch.int
    %2201 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %2202 = "torch.prim.ListConstruct"(%2199, %2200, %2201) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2203 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %2204 = "torch.aten.expand"(%2198, %2202, %2203) : (!torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ>
    %2205 = "torch_c.to_builtin_tensor"(%2196) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %2206 = "torch_c.to_builtin_tensor"(%2204) : (!torch.vtensor<[4,1024,4096],f8E4M3FNUZ>) -> tensor<4x1024x4096xf8E4M3FNUZ>
    %2207 = "util.call"(%2205, %2206) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x1x1024xf32>
    %2208 = "torch_c.from_builtin_tensor"(%2207) : (tensor<4x1x1024xf32>) -> !torch.vtensor<[4,1,1024],f32>
    %2209 = "torch.aten.div.Tensor"(%2208, %184) : (!torch.vtensor<[4,1,1024],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,1024],f32>
    %2210 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %2211 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %2212 = "torch.aten.clamp"(%2209, %2210, %2211) : (!torch.vtensor<[4,1,1024],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,1024],f32>
    %2213 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %2214 = "torch.prims.convert_element_type"(%2212, %2213) : (!torch.vtensor<[4,1,1024],f32>, !torch.int) -> !torch.vtensor<[4,1,1024],f8E4M3FNUZ>
    %2215 = "torch.aten.div.Tensor"(%2166, %186) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %2216 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %2217 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %2218 = "torch.aten.clamp"(%2215, %2216, %2217) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %2219 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %2220 = "torch.prims.convert_element_type"(%2218, %2219) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %2221 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %2222 = "torch.aten.unsqueeze"(%188, %2221) : (!torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>
    %2223 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %2224 = "torch.constant.int"() <{value = 1024 : i64}> : () -> !torch.int
    %2225 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %2226 = "torch.prim.ListConstruct"(%2223, %2224, %2225) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2227 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %2228 = "torch.aten.expand"(%2222, %2226, %2227) : (!torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ>
    %2229 = "torch_c.to_builtin_tensor"(%2220) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %2230 = "torch_c.to_builtin_tensor"(%2228) : (!torch.vtensor<[4,1024,4096],f8E4M3FNUZ>) -> tensor<4x1024x4096xf8E4M3FNUZ>
    %2231 = "util.call"(%2229, %2230) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x1x1024xf32>
    %2232 = "torch_c.from_builtin_tensor"(%2231) : (tensor<4x1x1024xf32>) -> !torch.vtensor<[4,1,1024],f32>
    %2233 = "torch.aten.div.Tensor"(%2232, %190) : (!torch.vtensor<[4,1,1024],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,1024],f32>
    %2234 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %2235 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %2236 = "torch.aten.clamp"(%2233, %2234, %2235) : (!torch.vtensor<[4,1,1024],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,1024],f32>
    %2237 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %2238 = "torch.prims.convert_element_type"(%2236, %2237) : (!torch.vtensor<[4,1,1024],f32>, !torch.int) -> !torch.vtensor<[4,1,1024],f8E4M3FNUZ>
    %2239 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %2240 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %2241 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %2242 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %2243 = "torch.prim.ListConstruct"(%2239, %2240, %2241, %2242) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2244 = "torch.aten.view"(%2190, %2243) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,1,32,128],f8E4M3FNUZ>
    %2245 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %2246 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %2247 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %2248 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %2249 = "torch.prim.ListConstruct"(%2245, %2246, %2247, %2248) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2250 = "torch.aten.view"(%2214, %2249) : (!torch.vtensor<[4,1,1024],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>
    %2251 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %2252 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %2253 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %2254 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %2255 = "torch.prim.ListConstruct"(%2251, %2252, %2253, %2254) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2256 = "torch.aten.view"(%2238, %2255) : (!torch.vtensor<[4,1,1024],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>
    %2257 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %2258 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %2259 = "torch.aten.transpose.int"(%2244, %2257, %2258) : (!torch.vtensor<[4,1,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %2260 = "torch.aten.mul.Tensor"(%2259, %1637) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,1,1,128],bf16>) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %2261 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %2262 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %2263 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %2264 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %2265 = "torch.aten.slice.Tensor"(%2259, %2261, %2262, %2263, %2264) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,32,1,64],f8E4M3FNUZ>
    %2266 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %2267 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %2268 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %2269 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %2270 = "torch.aten.slice.Tensor"(%2259, %2266, %2267, %2268, %2269) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,32,1,64],f8E4M3FNUZ>
    %2271 = "torch.aten.neg"(%2270) : (!torch.vtensor<[4,32,1,64],f8E4M3FNUZ>) -> !torch.vtensor<[4,32,1,64],f8E4M3FNUZ>
    %2272 = "torch.prim.ListConstruct"(%2271, %2265) : (!torch.vtensor<[4,32,1,64],f8E4M3FNUZ>, !torch.vtensor<[4,32,1,64],f8E4M3FNUZ>) -> !torch.list<vtensor>
    %2273 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %2274 = "torch.aten.cat"(%2272, %2273) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %2275 = "torch.aten.mul.Tensor"(%2274, %1651) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,1,1,128],bf16>) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %2276 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %2277 = "torch.aten.add.Tensor"(%2260, %2275, %2276) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %2278 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %2279 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %2280 = "torch.aten.transpose.int"(%2277, %2278, %2279) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,1,32,128],f8E4M3FNUZ>
    %2281 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %2282 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %2283 = "torch.aten.transpose.int"(%2250, %2281, %2282) : (!torch.vtensor<[4,1,8,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %2284 = "torch.aten.mul.Tensor"(%2283, %1637) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,1,1,128],bf16>) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %2285 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %2286 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %2287 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %2288 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %2289 = "torch.aten.slice.Tensor"(%2283, %2285, %2286, %2287, %2288) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,8,1,64],f8E4M3FNUZ>
    %2290 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %2291 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %2292 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %2293 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %2294 = "torch.aten.slice.Tensor"(%2283, %2290, %2291, %2292, %2293) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,8,1,64],f8E4M3FNUZ>
    %2295 = "torch.aten.neg"(%2294) : (!torch.vtensor<[4,8,1,64],f8E4M3FNUZ>) -> !torch.vtensor<[4,8,1,64],f8E4M3FNUZ>
    %2296 = "torch.prim.ListConstruct"(%2295, %2289) : (!torch.vtensor<[4,8,1,64],f8E4M3FNUZ>, !torch.vtensor<[4,8,1,64],f8E4M3FNUZ>) -> !torch.list<vtensor>
    %2297 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %2298 = "torch.aten.cat"(%2296, %2297) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %2299 = "torch.aten.mul.Tensor"(%2298, %1651) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,1,1,128],bf16>) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %2300 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %2301 = "torch.aten.add.Tensor"(%2284, %2299, %2300) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %2302 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %2303 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %2304 = "torch.aten.transpose.int"(%2301, %2302, %2303) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>
    %2305 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %2306 = "torch.aten.floor_divide.Scalar"(%arg64, %2305) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4],si64>
    %2307 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %2308 = "torch.aten.unsqueeze"(%2306, %2307) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %2309 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %2310 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %2311 = "torch.aten.gather"(%arg65, %2309, %2308, %2310) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.vtensor<[4,1],si64>, !torch.bool) -> !torch.vtensor<[4,1],si64>
    %2312 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %2313 = "torch.aten.remainder.Scalar"(%arg64, %2312) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4],si64>
    %2314 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %2315 = "torch.aten.unsqueeze"(%2313, %2314) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %2316 = "torch.constant.none"() : () -> !torch.none
    %2317 = "torch.aten.clone"(%191, %2316) : (!torch.vtensor<[],si64>, !torch.none) -> !torch.vtensor<[],si64>
    %2318 = "torch.aten.detach"(%2317) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %2319 = "torch.aten.detach"(%2318) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %2320 = "torch.aten.detach"(%2319) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %2321 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %2322 = "torch.aten.unsqueeze"(%2320, %2321) : (!torch.vtensor<[],si64>, !torch.int) -> !torch.vtensor<[1],si64>
    %2323 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %2324 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %2325 = "torch.prim.ListConstruct"(%2323, %2324) : (!torch.int, !torch.int) -> !torch.list<int>
    %2326 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %2327 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %2328 = "torch.prim.ListConstruct"(%2326, %2327) : (!torch.int, !torch.int) -> !torch.list<int>
    %2329 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %2330 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %2331 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %2332 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %2333 = "torch.aten.empty_strided"(%2325, %2328, %2329, %2330, %2331, %2332) : (!torch.list<int>, !torch.list<int>, !torch.int, !torch.int, !torch.Device, !torch.bool) -> !torch.vtensor<[4,1],si64>
    %2334 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %2335 = "torch.aten.fill.Scalar"(%2333, %2334) : (!torch.vtensor<[4,1],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %2336 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %2337 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %2338 = "torch.prim.ListConstruct"(%2336, %2337) : (!torch.int, !torch.int) -> !torch.list<int>
    %2339 = "torch.aten.repeat"(%2322, %2338) : (!torch.vtensor<[1],si64>, !torch.list<int>) -> !torch.vtensor<[4,1],si64>
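    // The four [4,1] index tensors for the scatter: the gathered page id, a tensor filled with
    // the layer index (1 here, so this appears to be the second transformer block), the K/V-plane
    // selector repeated from scalar %191 (presumably 0 for K; the clone + triple detach chain is
    // exporter noise), and the in-page offset.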
    %2340 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %2341 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %2342 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %2343 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %2344 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %2345 = "torch.prim.ListConstruct"(%1483, %2340, %2341, %2342, %2343, %2344) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2346 = "torch.aten.view"(%1910, %2345) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%2346, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %2347 = "torch.prim.ListConstruct"(%2311, %2335, %2339, %2315) : (!torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>) -> !torch.list<optional<vtensor>>
    %2348 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %2349 = "torch.aten.index_put"(%2346, %2347, %2304, %2348) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>, !torch.bool) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%2349, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %2350 = "torch.constant.int"() <{value = 2097152 : i64}> : () -> !torch.int
    %2351 = "torch.prim.ListConstruct"(%1483, %2350) : (!torch.int, !torch.int) -> !torch.list<int>
    %2352 = "torch.aten.view"(%2349, %2351) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,2097152],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%2352, %1479) <{shape_expressions = #map2}> : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> ()
    %2353 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %2354 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %2355 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %2356 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %2357 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %2358 = "torch.prim.ListConstruct"(%1483, %2353, %2354, %2355, %2356, %2357) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2359 = "torch.aten.view"(%2352, %2358) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%2359, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %2360 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %2361 = "torch.aten.floor_divide.Scalar"(%arg64, %2360) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4],si64>
    %2362 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %2363 = "torch.aten.unsqueeze"(%2361, %2362) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %2364 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %2365 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %2366 = "torch.aten.gather"(%arg65, %2364, %2363, %2365) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.vtensor<[4,1],si64>, !torch.bool) -> !torch.vtensor<[4,1],si64>
    %2367 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %2368 = "torch.aten.remainder.Scalar"(%arg64, %2367) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4],si64>
    %2369 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %2370 = "torch.aten.unsqueeze"(%2368, %2369) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %2371 = "torch.constant.none"() : () -> !torch.none
    %2372 = "torch.aten.clone"(%192, %2371) : (!torch.vtensor<[],si64>, !torch.none) -> !torch.vtensor<[],si64>
    %2373 = "torch.aten.detach"(%2372) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %2374 = "torch.aten.detach"(%2373) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %2375 = "torch.aten.detach"(%2374) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %2376 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %2377 = "torch.aten.unsqueeze"(%2375, %2376) : (!torch.vtensor<[],si64>, !torch.int) -> !torch.vtensor<[1],si64>
    %2378 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %2379 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %2380 = "torch.prim.ListConstruct"(%2378, %2379) : (!torch.int, !torch.int) -> !torch.list<int>
    %2381 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %2382 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %2383 = "torch.prim.ListConstruct"(%2381, %2382) : (!torch.int, !torch.int) -> !torch.list<int>
    %2384 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %2385 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %2386 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %2387 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %2388 = "torch.aten.empty_strided"(%2380, %2383, %2384, %2385, %2386, %2387) : (!torch.list<int>, !torch.list<int>, !torch.int, !torch.int, !torch.Device, !torch.bool) -> !torch.vtensor<[4,1],si64>
    %2389 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %2390 = "torch.aten.fill.Scalar"(%2388, %2389) : (!torch.vtensor<[4,1],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %2391 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %2392 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %2393 = "torch.prim.ListConstruct"(%2391, %2392) : (!torch.int, !torch.int) -> !torch.list<int>
    %2394 = "torch.aten.repeat"(%2377, %2393) : (!torch.vtensor<[1],si64>, !torch.list<int>) -> !torch.vtensor<[4,1],si64>
    %2395 = "torch.prim.ListConstruct"(%2366, %2390, %2394, %2370) : (!torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>) -> !torch.list<optional<vtensor>>
    %2396 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %2397 = "torch.aten.index_put"(%2359, %2395, %2256, %2396) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>, !torch.bool) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%2397, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %2398 = "torch.constant.int"() <{value = 2097152 : i64}> : () -> !torch.int
    %2399 = "torch.prim.ListConstruct"(%1483, %2398) : (!torch.int, !torch.int) -> !torch.list<int>
    %2400 = "torch.aten.view"(%2397, %2399) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,2097152],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%2400, %1479) <{shape_expressions = #map2}> : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> ()
    %2401 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %2402 = "torch.aten.mul.Scalar"(%arg65, %2401) : (!torch.vtensor<[4,?],si64>, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%2402, %1478) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %2403 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %2404 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %2405 = "torch.aten.add.Scalar"(%2402, %2403, %2404) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%2405, %1478) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %2406 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %2407 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %2408 = "torch.aten.add.Scalar"(%2405, %2406, %2407) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%2408, %1478) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %2409 = "torch.prim.ListConstruct"(%1924) : (!torch.int) -> !torch.list<int>
    %2410 = "torch.aten.view"(%2408, %2409) : (!torch.vtensor<[4,?],si64>, !torch.list<int>) -> !torch.vtensor<[?],si64>
    "torch.bind_symbolic_shape"(%2410, %1478) <{shape_expressions = #map24}> : (!torch.vtensor<[?],si64>, !torch.int) -> ()
    %2411 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %2412 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %2413 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %2414 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %2415 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %2416 = "torch.prim.ListConstruct"(%1483, %2411, %2412, %2413, %2414, %2415) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2417 = "torch.aten.view"(%2400, %2416) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%2417, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %2418 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %2419 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %2420 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %2421 = "torch.prim.ListConstruct"(%1914, %2418, %2419, %2420) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2422 = "torch.aten.view"(%2417, %2421) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%2422, %1479) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %2423 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %2424 = "torch.aten.index_select"(%2422, %2423, %2410) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int, !torch.vtensor<[?],si64>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%2424, %1478) <{shape_expressions = #map23}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %2425 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %2426 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %2427 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %2428 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %2429 = "torch.prim.ListConstruct"(%2425, %1481, %2426, %2427, %2428) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2430 = "torch.aten.view"(%2424, %2429) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%2430, %1478) <{shape_expressions = #map22}> : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %2431 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %2432 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %2433 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %2434 = "torch.prim.ListConstruct"(%2431, %1485, %2432, %2433) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2435 = "torch.aten.view"(%2430, %2434) : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%2435, %1478) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %2436 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %2437 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %2438 = "torch.aten.add.Scalar"(%2405, %2436, %2437) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%2438, %1478) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %2439 = "torch.prim.ListConstruct"(%1924) : (!torch.int) -> !torch.list<int>
    %2440 = "torch.aten.view"(%2438, %2439) : (!torch.vtensor<[4,?],si64>, !torch.list<int>) -> !torch.vtensor<[?],si64>
    "torch.bind_symbolic_shape"(%2440, %1478) <{shape_expressions = #map24}> : (!torch.vtensor<[?],si64>, !torch.int) -> ()
    %2441 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %2442 = "torch.aten.index_select"(%2422, %2441, %2440) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int, !torch.vtensor<[?],si64>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%2442, %1478) <{shape_expressions = #map23}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %2443 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %2444 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %2445 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %2446 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %2447 = "torch.prim.ListConstruct"(%2443, %1481, %2444, %2445, %2446) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2448 = "torch.aten.view"(%2442, %2447) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%2448, %1478) <{shape_expressions = #map22}> : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %2449 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %2450 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %2451 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %2452 = "torch.prim.ListConstruct"(%2449, %1485, %2450, %2451) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2453 = "torch.aten.view"(%2448, %2452) : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%2453, %1478) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %2454 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %2455 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %2456 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %2457 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %2458 = "torch.aten.slice.Tensor"(%2435, %2454, %2455, %2456, %2457) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%2458, %1478) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %2459 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %2460 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %2461 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %2462 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %2463 = "torch.aten.slice.Tensor"(%2453, %2459, %2460, %2461, %2462) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%2463, %1478) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %2464 = "torch.constant.int"() <{value = -2 : i64}> : () -> !torch.int
    %2465 = "torch.aten.unsqueeze"(%2458, %2464) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%2465, %1478) <{shape_expressions = #map25}> : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.int) -> ()
    %2466 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %2467 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %2468 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %2469 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %2470 = "torch.prim.ListConstruct"(%2466, %1485, %2467, %2468, %2469) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2471 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %2472 = "torch.aten.expand"(%2465, %2470, %2471) : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%2472, %1478) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %2473 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %2474 = "torch.aten.clone"(%2472, %2473) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%2474, %1478) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %2475 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %2476 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %2477 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %2478 = "torch.prim.ListConstruct"(%2475, %1485, %2476, %2477) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2479 = "torch.aten._unsafe_view"(%2474, %2478) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%2479, %1478) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %2480 = "torch.constant.int"() <{value = -2 : i64}> : () -> !torch.int
    %2481 = "torch.aten.unsqueeze"(%2463, %2480) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%2481, %1478) <{shape_expressions = #map25}> : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.int) -> ()
    %2482 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %2483 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %2484 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %2485 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %2486 = "torch.prim.ListConstruct"(%2482, %1485, %2483, %2484, %2485) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2487 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %2488 = "torch.aten.expand"(%2481, %2486, %2487) : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%2488, %1478) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %2489 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %2490 = "torch.aten.clone"(%2488, %2489) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%2490, %1478) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %2491 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %2492 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %2493 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %2494 = "torch.prim.ListConstruct"(%2491, %1485, %2492, %2493) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2495 = "torch.aten._unsafe_view"(%2490, %2494) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%2495, %1478) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %2496 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %2497 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %2498 = "torch.aten.transpose.int"(%2280, %2496, %2497) : (!torch.vtensor<[4,1,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %2499 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %2500 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %2501 = "torch.aten.transpose.int"(%2479, %2499, %2500) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%2501, %1478) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %2502 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %2503 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %2504 = "torch.aten.transpose.int"(%2495, %2502, %2503) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%2504, %1478) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %2505 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %2506 = "torch.aten.squeeze.dim"(%1516, %2505) : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,1,1,?],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%2506, %1478) <{shape_expressions = #map7}> : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>, !torch.int) -> ()
    %2507 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %2508 = "torch.aten.squeeze.dim"(%2506, %2507) : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,1,1,?],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%2508, %1478) <{shape_expressions = #map7}> : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>, !torch.int) -> ()
    %2509 = "torch_c.to_builtin_tensor"(%2498) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>) -> tensor<4x32x1x128xf8E4M3FNUZ>
    %2510 = "tensor.cast"(%2509) : (tensor<4x32x1x128xf8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %2511 = "torch_c.to_builtin_tensor"(%2501) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %2512 = "torch_c.to_builtin_tensor"(%2504) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %2513 = "torch_c.to_builtin_tensor"(%2508) : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>) -> tensor<4x1x1x?xf8E4M3FNUZ>
    %2514 = "tensor.cast"(%2513) : (tensor<4x1x1x?xf8E4M3FNUZ>) -> tensor<?x?x?x?xf8E4M3FNUZ>
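    // Masked flash-attention kernel call. Q is cast from [4,32,1,128] to a dynamic sequence
    // dimension and the mask to a fully dynamic rank-4 tensor to match the kernel's ?-shaped
    // signature; %194 is the softmax scale. The kernel accumulates and returns f32.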
    %2515 = "torch_c.to_builtin_tensor"(%194) : (!torch.vtensor<[],f32>) -> tensor<f32>
    %2516 = "util.call"(%2510, %2511, %2512, %2515, %2514) <{callee = @sharktank_masked_flash_attention_4_32_128_128_f8E4M3FNUZ_f32_f32}> : (tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<f32>, tensor<?x?x?x?xf8E4M3FNUZ>) -> tensor<4x32x?x128xf32>
    %2517 = "tensor.cast"(%2516) : (tensor<4x32x?x128xf32>) -> tensor<4x32x1x128xf32>
    %2518 = "torch_c.from_builtin_tensor"(%2517) : (tensor<4x32x1x128xf32>) -> !torch.vtensor<[4,32,1,128],f32>
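    // Attention output: transpose back to [4,1,32,128], merge heads into [4,1,4096], then
    // quantize for the output projection: divide by the rscale, clamp to +/-240 (the
    // f8E4M3FNUZ finite maximum), and convert element type (dtype 26 = f8E4M3FNUZ).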
    %2519 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %2520 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %2521 = "torch.aten.transpose.int"(%2518, %2519, %2520) : (!torch.vtensor<[4,32,1,128],f32>, !torch.int, !torch.int) -> !torch.vtensor<[4,1,32,128],f32>
    %2522 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %2523 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %2524 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %2525 = "torch.prim.ListConstruct"(%2522, %2523, %2524) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2526 = "torch.aten.view"(%2521, %2525) : (!torch.vtensor<[4,1,32,128],f32>, !torch.list<int>) -> !torch.vtensor<[4,1,4096],f32>
    %2527 = "torch.aten.div.Tensor"(%2526, %196) : (!torch.vtensor<[4,1,4096],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],f32>
    %2528 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %2529 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %2530 = "torch.aten.clamp"(%2527, %2528, %2529) : (!torch.vtensor<[4,1,4096],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],f32>
    %2531 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %2532 = "torch.prims.convert_element_type"(%2530, %2531) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %2533 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %2534 = "torch.aten.unsqueeze"(%198, %2533) : (!torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>
    %2535 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %2536 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %2537 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %2538 = "torch.prim.ListConstruct"(%2535, %2536, %2537) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2539 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %2540 = "torch.aten.expand"(%2534, %2538, %2539) : (!torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ>
    %2541 = "torch_c.to_builtin_tensor"(%2532) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %2542 = "torch_c.to_builtin_tensor"(%2540) : (!torch.vtensor<[4,4096,4096],f8E4M3FNUZ>) -> tensor<4x4096x4096xf8E4M3FNUZ>
    %2543 = "util.call"(%2541, %2542) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x1x4096xf32>
    %2544 = "torch_c.from_builtin_tensor"(%2543) : (tensor<4x1x4096xf32>) -> !torch.vtensor<[4,1,4096],f32>
    %2545 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %2546 = "torch.prims.convert_element_type"(%2544, %2545) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
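    // First residual connection: the bf16 projection result is added to the block input %2147.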
    %2547 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %2548 = "torch.aten.add.Tensor"(%2147, %2546, %2547) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
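    // RMSNorm (ffn_norm): mean of squares over the hidden dim in f32, add eps 1e-5, rsqrt,
    // scale, multiply by the norm weight, and cast back to bf16.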
    %2549 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %2550 = "torch.prims.convert_element_type"(%2548, %2549) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f32>
    %2551 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %2552 = "torch.aten.pow.Tensor_Scalar"(%2550, %2551) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],f32>
    %2553 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %2554 = "torch.prim.ListConstruct"(%2553) : (!torch.int) -> !torch.list<int>
    %2555 = "torch.constant.bool"() <{value = true}> : () -> !torch.bool
    %2556 = "torch.constant.none"() : () -> !torch.none
    %2557 = "torch.aten.mean.dim"(%2552, %2554, %2555, %2556) : (!torch.vtensor<[4,1,4096],f32>, !torch.list<int>, !torch.bool, !torch.none) -> !torch.vtensor<[4,1,1],f32>
    %2558 = "torch.constant.float"() <{value = 1.000000e-05 : f64}> : () -> !torch.float
    %2559 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %2560 = "torch.aten.add.Scalar"(%2557, %2558, %2559) : (!torch.vtensor<[4,1,1],f32>, !torch.float, !torch.int) -> !torch.vtensor<[4,1,1],f32>
    %2561 = "torch.aten.rsqrt"(%2560) : (!torch.vtensor<[4,1,1],f32>) -> !torch.vtensor<[4,1,1],f32>
    %2562 = "torch.aten.mul.Tensor"(%2550, %2561) : (!torch.vtensor<[4,1,4096],f32>, !torch.vtensor<[4,1,1],f32>) -> !torch.vtensor<[4,1,4096],f32>
    %2563 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %2564 = "torch.prims.convert_element_type"(%2562, %2563) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %2565 = "torch.aten.mul.Tensor"(%200, %2564) : (!torch.vtensor<[4096],bf16>, !torch.vtensor<[4,1,4096],bf16>) -> !torch.vtensor<[4,1,4096],bf16>
    %2566 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %2567 = "torch.prims.convert_element_type"(%2565, %2566) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
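    // SwiGLU feed-forward: gate and up projections to 14336, silu(gate) * up, then the down
    // projection back to 4096. Each matmul is an fp8 sharktank kernel, so every input is
    // rescaled, clamped to +/-240, and converted to f8E4M3FNUZ first.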
    %2568 = "torch.aten.div.Tensor"(%2567, %202) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %2569 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %2570 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %2571 = "torch.aten.clamp"(%2568, %2569, %2570) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %2572 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %2573 = "torch.prims.convert_element_type"(%2571, %2572) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %2574 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %2575 = "torch.aten.unsqueeze"(%204, %2574) : (!torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>
    %2576 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %2577 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %2578 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %2579 = "torch.prim.ListConstruct"(%2576, %2577, %2578) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2580 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %2581 = "torch.aten.expand"(%2575, %2579, %2580) : (!torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ>
    %2582 = "torch_c.to_builtin_tensor"(%2573) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %2583 = "torch_c.to_builtin_tensor"(%2581) : (!torch.vtensor<[4,14336,4096],f8E4M3FNUZ>) -> tensor<4x14336x4096xf8E4M3FNUZ>
    %2584 = "util.call"(%2582, %2583) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x1x14336xf32>
    %2585 = "torch_c.from_builtin_tensor"(%2584) : (tensor<4x1x14336xf32>) -> !torch.vtensor<[4,1,14336],f32>
    %2586 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %2587 = "torch.prims.convert_element_type"(%2585, %2586) : (!torch.vtensor<[4,1,14336],f32>, !torch.int) -> !torch.vtensor<[4,1,14336],bf16>
    %2588 = "torch.aten.silu"(%2587) : (!torch.vtensor<[4,1,14336],bf16>) -> !torch.vtensor<[4,1,14336],bf16>
    %2589 = "torch.aten.div.Tensor"(%2567, %206) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %2590 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %2591 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %2592 = "torch.aten.clamp"(%2589, %2590, %2591) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %2593 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %2594 = "torch.prims.convert_element_type"(%2592, %2593) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %2595 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %2596 = "torch.aten.unsqueeze"(%208, %2595) : (!torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>
    %2597 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %2598 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %2599 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %2600 = "torch.prim.ListConstruct"(%2597, %2598, %2599) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2601 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %2602 = "torch.aten.expand"(%2596, %2600, %2601) : (!torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ>
    %2603 = "torch_c.to_builtin_tensor"(%2594) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %2604 = "torch_c.to_builtin_tensor"(%2602) : (!torch.vtensor<[4,14336,4096],f8E4M3FNUZ>) -> tensor<4x14336x4096xf8E4M3FNUZ>
    %2605 = "util.call"(%2603, %2604) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x1x14336xf32>
    %2606 = "torch_c.from_builtin_tensor"(%2605) : (tensor<4x1x14336xf32>) -> !torch.vtensor<[4,1,14336],f32>
    %2607 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %2608 = "torch.prims.convert_element_type"(%2606, %2607) : (!torch.vtensor<[4,1,14336],f32>, !torch.int) -> !torch.vtensor<[4,1,14336],bf16>
    %2609 = "torch.aten.mul.Tensor"(%2588, %2608) : (!torch.vtensor<[4,1,14336],bf16>, !torch.vtensor<[4,1,14336],bf16>) -> !torch.vtensor<[4,1,14336],bf16>
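    // Down projection: the silu(gate) * up product is quantized to f8 and multiplied against
    // the broadcast [4,4096,14336] weight.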
    %2610 = "torch.aten.div.Tensor"(%2609, %210) : (!torch.vtensor<[4,1,14336],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,14336],bf16>
    %2611 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %2612 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %2613 = "torch.aten.clamp"(%2610, %2611, %2612) : (!torch.vtensor<[4,1,14336],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,14336],bf16>
    %2614 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %2615 = "torch.prims.convert_element_type"(%2613, %2614) : (!torch.vtensor<[4,1,14336],bf16>, !torch.int) -> !torch.vtensor<[4,1,14336],f8E4M3FNUZ>
    %2616 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %2617 = "torch.aten.unsqueeze"(%212, %2616) : (!torch.vtensor<[4096,14336],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,14336],f8E4M3FNUZ>
    %2618 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %2619 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %2620 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %2621 = "torch.prim.ListConstruct"(%2618, %2619, %2620) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2622 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %2623 = "torch.aten.expand"(%2617, %2621, %2622) : (!torch.vtensor<[1,4096,14336],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,14336],f8E4M3FNUZ>
    %2624 = "torch_c.to_builtin_tensor"(%2615) : (!torch.vtensor<[4,1,14336],f8E4M3FNUZ>) -> tensor<4x1x14336xf8E4M3FNUZ>
    %2625 = "torch_c.to_builtin_tensor"(%2623) : (!torch.vtensor<[4,4096,14336],f8E4M3FNUZ>) -> tensor<4x4096x14336xf8E4M3FNUZ>
    %2626 = "util.call"(%2624, %2625) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x14336xf8E4M3FNUZ_R4x4096x14336xf8E4M3FNUZ}> : (tensor<4x1x14336xf8E4M3FNUZ>, tensor<4x4096x14336xf8E4M3FNUZ>) -> tensor<4x1x4096xf32>
    %2627 = "torch_c.from_builtin_tensor"(%2626) : (tensor<4x1x4096xf32>) -> !torch.vtensor<[4,1,4096],f32>
    %2628 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %2629 = "torch.prims.convert_element_type"(%2627, %2628) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %2630 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %2631 = "torch.aten.add.Tensor"(%2548, %2629, %2630) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
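    // Second residual add closes this transformer block; the next block's attn_norm RMSNorm
    // follows on the new residual stream.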
    %2632 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %2633 = "torch.prims.convert_element_type"(%2631, %2632) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f32>
    %2634 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %2635 = "torch.aten.pow.Tensor_Scalar"(%2633, %2634) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],f32>
    %2636 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %2637 = "torch.prim.ListConstruct"(%2636) : (!torch.int) -> !torch.list<int>
    %2638 = "torch.constant.bool"() <{value = true}> : () -> !torch.bool
    %2639 = "torch.constant.none"() : () -> !torch.none
    %2640 = "torch.aten.mean.dim"(%2635, %2637, %2638, %2639) : (!torch.vtensor<[4,1,4096],f32>, !torch.list<int>, !torch.bool, !torch.none) -> !torch.vtensor<[4,1,1],f32>
    %2641 = "torch.constant.float"() <{value = 1.000000e-05 : f64}> : () -> !torch.float
    %2642 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %2643 = "torch.aten.add.Scalar"(%2640, %2641, %2642) : (!torch.vtensor<[4,1,1],f32>, !torch.float, !torch.int) -> !torch.vtensor<[4,1,1],f32>
    %2644 = "torch.aten.rsqrt"(%2643) : (!torch.vtensor<[4,1,1],f32>) -> !torch.vtensor<[4,1,1],f32>
    %2645 = "torch.aten.mul.Tensor"(%2633, %2644) : (!torch.vtensor<[4,1,4096],f32>, !torch.vtensor<[4,1,1],f32>) -> !torch.vtensor<[4,1,4096],f32>
    %2646 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %2647 = "torch.prims.convert_element_type"(%2645, %2646) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %2648 = "torch.aten.mul.Tensor"(%214, %2647) : (!torch.vtensor<[4096],bf16>, !torch.vtensor<[4,1,4096],bf16>) -> !torch.vtensor<[4,1,4096],bf16>
    %2649 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %2650 = "torch.prims.convert_element_type"(%2648, %2649) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
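    // Q/K/V projections for the next block: three fp8 batch matmuls (4096->4096 for Q,
    // 4096->1024 for each of K and V), each bracketed by input-rscale and output-rscale
    // divide / clamp / f8-convert quantization steps.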
    %2651 = "torch.aten.div.Tensor"(%2650, %216) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %2652 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %2653 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %2654 = "torch.aten.clamp"(%2651, %2652, %2653) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %2655 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %2656 = "torch.prims.convert_element_type"(%2654, %2655) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %2657 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %2658 = "torch.aten.unsqueeze"(%218, %2657) : (!torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>
    %2659 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %2660 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %2661 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %2662 = "torch.prim.ListConstruct"(%2659, %2660, %2661) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2663 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %2664 = "torch.aten.expand"(%2658, %2662, %2663) : (!torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ>
    %2665 = "torch_c.to_builtin_tensor"(%2656) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %2666 = "torch_c.to_builtin_tensor"(%2664) : (!torch.vtensor<[4,4096,4096],f8E4M3FNUZ>) -> tensor<4x4096x4096xf8E4M3FNUZ>
    %2667 = "util.call"(%2665, %2666) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x1x4096xf32>
    %2668 = "torch_c.from_builtin_tensor"(%2667) : (tensor<4x1x4096xf32>) -> !torch.vtensor<[4,1,4096],f32>
    %2669 = "torch.aten.div.Tensor"(%2668, %220) : (!torch.vtensor<[4,1,4096],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],f32>
    %2670 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %2671 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %2672 = "torch.aten.clamp"(%2669, %2670, %2671) : (!torch.vtensor<[4,1,4096],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],f32>
    %2673 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %2674 = "torch.prims.convert_element_type"(%2672, %2673) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %2675 = "torch.aten.div.Tensor"(%2650, %222) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %2676 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %2677 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %2678 = "torch.aten.clamp"(%2675, %2676, %2677) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %2679 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %2680 = "torch.prims.convert_element_type"(%2678, %2679) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %2681 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %2682 = "torch.aten.unsqueeze"(%224, %2681) : (!torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>
    %2683 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %2684 = "torch.constant.int"() <{value = 1024 : i64}> : () -> !torch.int
    %2685 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %2686 = "torch.prim.ListConstruct"(%2683, %2684, %2685) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2687 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %2688 = "torch.aten.expand"(%2682, %2686, %2687) : (!torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ>
    %2689 = "torch_c.to_builtin_tensor"(%2680) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %2690 = "torch_c.to_builtin_tensor"(%2688) : (!torch.vtensor<[4,1024,4096],f8E4M3FNUZ>) -> tensor<4x1024x4096xf8E4M3FNUZ>
    %2691 = "util.call"(%2689, %2690) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x1x1024xf32>
    %2692 = "torch_c.from_builtin_tensor"(%2691) : (tensor<4x1x1024xf32>) -> !torch.vtensor<[4,1,1024],f32>
    %2693 = "torch.aten.div.Tensor"(%2692, %226) : (!torch.vtensor<[4,1,1024],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,1024],f32>
    %2694 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %2695 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %2696 = "torch.aten.clamp"(%2693, %2694, %2695) : (!torch.vtensor<[4,1,1024],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,1024],f32>
    %2697 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %2698 = "torch.prims.convert_element_type"(%2696, %2697) : (!torch.vtensor<[4,1,1024],f32>, !torch.int) -> !torch.vtensor<[4,1,1024],f8E4M3FNUZ>
    %2699 = "torch.aten.div.Tensor"(%2650, %228) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %2700 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %2701 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %2702 = "torch.aten.clamp"(%2699, %2700, %2701) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %2703 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %2704 = "torch.prims.convert_element_type"(%2702, %2703) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %2705 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %2706 = "torch.aten.unsqueeze"(%230, %2705) : (!torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>
    %2707 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %2708 = "torch.constant.int"() <{value = 1024 : i64}> : () -> !torch.int
    %2709 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %2710 = "torch.prim.ListConstruct"(%2707, %2708, %2709) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2711 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %2712 = "torch.aten.expand"(%2706, %2710, %2711) : (!torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ>
    %2713 = "torch_c.to_builtin_tensor"(%2704) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %2714 = "torch_c.to_builtin_tensor"(%2712) : (!torch.vtensor<[4,1024,4096],f8E4M3FNUZ>) -> tensor<4x1024x4096xf8E4M3FNUZ>
    %2715 = "util.call"(%2713, %2714) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x1x1024xf32>
    %2716 = "torch_c.from_builtin_tensor"(%2715) : (tensor<4x1x1024xf32>) -> !torch.vtensor<[4,1,1024],f32>
    %2717 = "torch.aten.div.Tensor"(%2716, %232) : (!torch.vtensor<[4,1,1024],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,1024],f32>
    %2718 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %2719 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %2720 = "torch.aten.clamp"(%2717, %2718, %2719) : (!torch.vtensor<[4,1,1024],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,1024],f32>
    %2721 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %2722 = "torch.prims.convert_element_type"(%2720, %2721) : (!torch.vtensor<[4,1,1024],f32>, !torch.int) -> !torch.vtensor<[4,1,1024],f8E4M3FNUZ>
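    // Head split: Q to [4,1,32,128] and K/V to [4,1,8,128] (32 query heads, 8 KV heads,
    // head dim 128).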
    %2723 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %2724 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %2725 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %2726 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %2727 = "torch.prim.ListConstruct"(%2723, %2724, %2725, %2726) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2728 = "torch.aten.view"(%2674, %2727) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,1,32,128],f8E4M3FNUZ>
    %2729 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %2730 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %2731 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %2732 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %2733 = "torch.prim.ListConstruct"(%2729, %2730, %2731, %2732) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2734 = "torch.aten.view"(%2698, %2733) : (!torch.vtensor<[4,1,1024],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>
    %2735 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %2736 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %2737 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %2738 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %2739 = "torch.prim.ListConstruct"(%2735, %2736, %2737, %2738) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2740 = "torch.aten.view"(%2722, %2739) : (!torch.vtensor<[4,1,1024],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>
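    // Rotary embedding for this block's Q and K, identical to the pattern above: transpose
    // heads forward, x*cos + rotate_half(x)*sin with %1637/%1651, transpose back.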
    %2741 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %2742 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %2743 = "torch.aten.transpose.int"(%2728, %2741, %2742) : (!torch.vtensor<[4,1,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %2744 = "torch.aten.mul.Tensor"(%2743, %1637) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,1,1,128],bf16>) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %2745 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %2746 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %2747 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %2748 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %2749 = "torch.aten.slice.Tensor"(%2743, %2745, %2746, %2747, %2748) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,32,1,64],f8E4M3FNUZ>
    %2750 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %2751 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %2752 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %2753 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %2754 = "torch.aten.slice.Tensor"(%2743, %2750, %2751, %2752, %2753) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,32,1,64],f8E4M3FNUZ>
    %2755 = "torch.aten.neg"(%2754) : (!torch.vtensor<[4,32,1,64],f8E4M3FNUZ>) -> !torch.vtensor<[4,32,1,64],f8E4M3FNUZ>
    %2756 = "torch.prim.ListConstruct"(%2755, %2749) : (!torch.vtensor<[4,32,1,64],f8E4M3FNUZ>, !torch.vtensor<[4,32,1,64],f8E4M3FNUZ>) -> !torch.list<vtensor>
    %2757 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %2758 = "torch.aten.cat"(%2756, %2757) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %2759 = "torch.aten.mul.Tensor"(%2758, %1651) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,1,1,128],bf16>) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %2760 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %2761 = "torch.aten.add.Tensor"(%2744, %2759, %2760) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %2762 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %2763 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %2764 = "torch.aten.transpose.int"(%2761, %2762, %2763) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,1,32,128],f8E4M3FNUZ>
    %2765 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %2766 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %2767 = "torch.aten.transpose.int"(%2734, %2765, %2766) : (!torch.vtensor<[4,1,8,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %2768 = "torch.aten.mul.Tensor"(%2767, %1637) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,1,1,128],bf16>) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %2769 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %2770 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %2771 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %2772 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %2773 = "torch.aten.slice.Tensor"(%2767, %2769, %2770, %2771, %2772) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,8,1,64],f8E4M3FNUZ>
    %2774 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %2775 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %2776 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %2777 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %2778 = "torch.aten.slice.Tensor"(%2767, %2774, %2775, %2776, %2777) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,8,1,64],f8E4M3FNUZ>
    %2779 = "torch.aten.neg"(%2778) : (!torch.vtensor<[4,8,1,64],f8E4M3FNUZ>) -> !torch.vtensor<[4,8,1,64],f8E4M3FNUZ>
    %2780 = "torch.prim.ListConstruct"(%2779, %2773) : (!torch.vtensor<[4,8,1,64],f8E4M3FNUZ>, !torch.vtensor<[4,8,1,64],f8E4M3FNUZ>) -> !torch.list<vtensor>
    %2781 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %2782 = "torch.aten.cat"(%2780, %2781) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %2783 = "torch.aten.mul.Tensor"(%2782, %1651) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,1,1,128],bf16>) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %2784 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %2785 = "torch.aten.add.Tensor"(%2768, %2783, %2784) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %2786 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %2787 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %2788 = "torch.aten.transpose.int"(%2785, %2786, %2787) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>
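    // Paged KV-cache write for this layer's K: same page/offset arithmetic, with the layer
    // index tensor now filled with 2, consistent with cache dim 1 indexing the 32 layers.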
    %2789 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %2790 = "torch.aten.floor_divide.Scalar"(%arg64, %2789) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4],si64>
    %2791 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %2792 = "torch.aten.unsqueeze"(%2790, %2791) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %2793 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %2794 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %2795 = "torch.aten.gather"(%arg65, %2793, %2792, %2794) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.vtensor<[4,1],si64>, !torch.bool) -> !torch.vtensor<[4,1],si64>
    %2796 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %2797 = "torch.aten.remainder.Scalar"(%arg64, %2796) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4],si64>
    %2798 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %2799 = "torch.aten.unsqueeze"(%2797, %2798) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %2800 = "torch.constant.none"() : () -> !torch.none
    %2801 = "torch.aten.clone"(%233, %2800) : (!torch.vtensor<[],si64>, !torch.none) -> !torch.vtensor<[],si64>
    %2802 = "torch.aten.detach"(%2801) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %2803 = "torch.aten.detach"(%2802) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %2804 = "torch.aten.detach"(%2803) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %2805 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %2806 = "torch.aten.unsqueeze"(%2804, %2805) : (!torch.vtensor<[],si64>, !torch.int) -> !torch.vtensor<[1],si64>
    %2807 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %2808 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %2809 = "torch.prim.ListConstruct"(%2807, %2808) : (!torch.int, !torch.int) -> !torch.list<int>
    %2810 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %2811 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %2812 = "torch.prim.ListConstruct"(%2810, %2811) : (!torch.int, !torch.int) -> !torch.list<int>
    %2813 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %2814 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %2815 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %2816 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %2817 = "torch.aten.empty_strided"(%2809, %2812, %2813, %2814, %2815, %2816) : (!torch.list<int>, !torch.list<int>, !torch.int, !torch.int, !torch.Device, !torch.bool) -> !torch.vtensor<[4,1],si64>
    %2818 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %2819 = "torch.aten.fill.Scalar"(%2817, %2818) : (!torch.vtensor<[4,1],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %2820 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %2821 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %2822 = "torch.prim.ListConstruct"(%2820, %2821) : (!torch.int, !torch.int) -> !torch.list<int>
    %2823 = "torch.aten.repeat"(%2806, %2822) : (!torch.vtensor<[1],si64>, !torch.list<int>) -> !torch.vtensor<[4,1],si64>
    %2824 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %2825 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %2826 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %2827 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %2828 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %2829 = "torch.prim.ListConstruct"(%1483, %2824, %2825, %2826, %2827, %2828) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2830 = "torch.aten.view"(%2400, %2829) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%2830, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %2831 = "torch.prim.ListConstruct"(%2795, %2819, %2823, %2799) : (!torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>) -> !torch.list<optional<vtensor>>
    %2832 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %2833 = "torch.aten.index_put"(%2830, %2831, %2788, %2832) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>, !torch.bool) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%2833, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %2834 = "torch.constant.int"() <{value = 2097152 : i64}> : () -> !torch.int
    %2835 = "torch.prim.ListConstruct"(%1483, %2834) : (!torch.int, !torch.int) -> !torch.list<int>
    %2836 = "torch.aten.view"(%2833, %2835) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,2097152],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%2836, %1479) <{shape_expressions = #map2}> : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> ()
    %2837 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %2838 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %2839 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %2840 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %2841 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %2842 = "torch.prim.ListConstruct"(%1483, %2837, %2838, %2839, %2840, %2841) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2843 = "torch.aten.view"(%2836, %2842) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%2843, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %2844 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %2845 = "torch.aten.floor_divide.Scalar"(%arg64, %2844) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4],si64>
    %2846 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %2847 = "torch.aten.unsqueeze"(%2845, %2846) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %2848 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %2849 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %2850 = "torch.aten.gather"(%arg65, %2848, %2847, %2849) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.vtensor<[4,1],si64>, !torch.bool) -> !torch.vtensor<[4,1],si64>
    %2851 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %2852 = "torch.aten.remainder.Scalar"(%arg64, %2851) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4],si64>
    %2853 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %2854 = "torch.aten.unsqueeze"(%2852, %2853) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %2855 = "torch.constant.none"() : () -> !torch.none
    %2856 = "torch.aten.clone"(%234, %2855) : (!torch.vtensor<[],si64>, !torch.none) -> !torch.vtensor<[],si64>
    %2857 = "torch.aten.detach"(%2856) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %2858 = "torch.aten.detach"(%2857) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %2859 = "torch.aten.detach"(%2858) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %2860 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %2861 = "torch.aten.unsqueeze"(%2859, %2860) : (!torch.vtensor<[],si64>, !torch.int) -> !torch.vtensor<[1],si64>
    %2862 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %2863 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %2864 = "torch.prim.ListConstruct"(%2862, %2863) : (!torch.int, !torch.int) -> !torch.list<int>
    %2865 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %2866 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %2867 = "torch.prim.ListConstruct"(%2865, %2866) : (!torch.int, !torch.int) -> !torch.list<int>
    %2868 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %2869 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %2870 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %2871 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %2872 = "torch.aten.empty_strided"(%2864, %2867, %2868, %2869, %2870, %2871) : (!torch.list<int>, !torch.list<int>, !torch.int, !torch.int, !torch.Device, !torch.bool) -> !torch.vtensor<[4,1],si64>
    %2873 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %2874 = "torch.aten.fill.Scalar"(%2872, %2873) : (!torch.vtensor<[4,1],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %2875 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %2876 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %2877 = "torch.prim.ListConstruct"(%2875, %2876) : (!torch.int, !torch.int) -> !torch.list<int>
    %2878 = "torch.aten.repeat"(%2861, %2877) : (!torch.vtensor<[1],si64>, !torch.list<int>) -> !torch.vtensor<[4,1],si64>
    %2879 = "torch.prim.ListConstruct"(%2850, %2874, %2878, %2854) : (!torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>) -> !torch.list<optional<vtensor>>
    %2880 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %2881 = "torch.aten.index_put"(%2843, %2879, %2740, %2880) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>, !torch.bool) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%2881, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %2882 = "torch.constant.int"() <{value = 2097152 : i64}> : () -> !torch.int
    %2883 = "torch.prim.ListConstruct"(%1483, %2882) : (!torch.int, !torch.int) -> !torch.list<int>
    %2884 = "torch.aten.view"(%2881, %2883) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,2097152],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%2884, %1479) <{shape_expressions = #map2}> : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> ()
    %2885 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %2886 = "torch.aten.mul.Scalar"(%arg65, %2885) : (!torch.vtensor<[4,?],si64>, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%2886, %1478) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %2887 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %2888 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %2889 = "torch.aten.add.Scalar"(%2886, %2887, %2888) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%2889, %1478) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %2890 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %2891 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %2892 = "torch.aten.add.Scalar"(%2889, %2890, %2891) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%2892, %1478) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %2893 = "torch.prim.ListConstruct"(%1924) : (!torch.int) -> !torch.list<int>
    %2894 = "torch.aten.view"(%2892, %2893) : (!torch.vtensor<[4,?],si64>, !torch.list<int>) -> !torch.vtensor<[?],si64>
    "torch.bind_symbolic_shape"(%2894, %1478) <{shape_expressions = #map24}> : (!torch.vtensor<[?],si64>, !torch.int) -> ()
    %2895 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %2896 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %2897 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %2898 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %2899 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %2900 = "torch.prim.ListConstruct"(%1483, %2895, %2896, %2897, %2898, %2899) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2901 = "torch.aten.view"(%2884, %2900) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%2901, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %2902 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %2903 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %2904 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %2905 = "torch.prim.ListConstruct"(%1914, %2902, %2903, %2904) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2906 = "torch.aten.view"(%2901, %2905) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%2906, %1479) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %2907 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %2908 = "torch.aten.index_select"(%2906, %2907, %2894) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int, !torch.vtensor<[?],si64>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%2908, %1478) <{shape_expressions = #map23}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %2909 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %2910 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %2911 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %2912 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %2913 = "torch.prim.ListConstruct"(%2909, %1481, %2910, %2911, %2912) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2914 = "torch.aten.view"(%2908, %2913) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%2914, %1478) <{shape_expressions = #map22}> : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %2915 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %2916 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %2917 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %2918 = "torch.prim.ListConstruct"(%2915, %1485, %2916, %2917) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2919 = "torch.aten.view"(%2914, %2918) : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%2919, %1478) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %2920 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %2921 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %2922 = "torch.aten.add.Scalar"(%2889, %2920, %2921) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%2922, %1478) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %2923 = "torch.prim.ListConstruct"(%1924) : (!torch.int) -> !torch.list<int>
    %2924 = "torch.aten.view"(%2922, %2923) : (!torch.vtensor<[4,?],si64>, !torch.list<int>) -> !torch.vtensor<[?],si64>
    "torch.bind_symbolic_shape"(%2924, %1478) <{shape_expressions = #map24}> : (!torch.vtensor<[?],si64>, !torch.int) -> ()
    %2925 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %2926 = "torch.aten.index_select"(%2906, %2925, %2924) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int, !torch.vtensor<[?],si64>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%2926, %1478) <{shape_expressions = #map23}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %2927 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %2928 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %2929 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %2930 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %2931 = "torch.prim.ListConstruct"(%2927, %1481, %2928, %2929, %2930) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2932 = "torch.aten.view"(%2926, %2931) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%2932, %1478) <{shape_expressions = #map22}> : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %2933 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %2934 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %2935 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %2936 = "torch.prim.ListConstruct"(%2933, %1485, %2934, %2935) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2937 = "torch.aten.view"(%2932, %2936) : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%2937, %1478) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %2938 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %2939 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %2940 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %2941 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %2942 = "torch.aten.slice.Tensor"(%2919, %2938, %2939, %2940, %2941) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%2942, %1478) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %2943 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %2944 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %2945 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %2946 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %2947 = "torch.aten.slice.Tensor"(%2937, %2943, %2944, %2945, %2946) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%2947, %1478) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %2948 = "torch.constant.int"() <{value = -2 : i64}> : () -> !torch.int
    %2949 = "torch.aten.unsqueeze"(%2942, %2948) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%2949, %1478) <{shape_expressions = #map25}> : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.int) -> ()
    %2950 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %2951 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %2952 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %2953 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %2954 = "torch.prim.ListConstruct"(%2950, %1485, %2951, %2952, %2953) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2955 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %2956 = "torch.aten.expand"(%2949, %2954, %2955) : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%2956, %1478) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %2957 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %2958 = "torch.aten.clone"(%2956, %2957) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%2958, %1478) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %2959 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %2960 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %2961 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %2962 = "torch.prim.ListConstruct"(%2959, %1485, %2960, %2961) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2963 = "torch.aten._unsafe_view"(%2958, %2962) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%2963, %1478) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %2964 = "torch.constant.int"() <{value = -2 : i64}> : () -> !torch.int
    %2965 = "torch.aten.unsqueeze"(%2947, %2964) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%2965, %1478) <{shape_expressions = #map25}> : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.int) -> ()
    %2966 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %2967 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %2968 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %2969 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %2970 = "torch.prim.ListConstruct"(%2966, %1485, %2967, %2968, %2969) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2971 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %2972 = "torch.aten.expand"(%2965, %2970, %2971) : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%2972, %1478) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %2973 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %2974 = "torch.aten.clone"(%2972, %2973) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%2974, %1478) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %2975 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %2976 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %2977 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %2978 = "torch.prim.ListConstruct"(%2975, %1485, %2976, %2977) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2979 = "torch.aten._unsafe_view"(%2974, %2978) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%2979, %1478) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %2980 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %2981 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %2982 = "torch.aten.transpose.int"(%2764, %2980, %2981) : (!torch.vtensor<[4,1,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %2983 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %2984 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %2985 = "torch.aten.transpose.int"(%2963, %2983, %2984) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%2985, %1478) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %2986 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %2987 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %2988 = "torch.aten.transpose.int"(%2979, %2986, %2987) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%2988, %1478) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %2989 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %2990 = "torch.aten.squeeze.dim"(%1516, %2989) : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,1,1,?],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%2990, %1478) <{shape_expressions = #map7}> : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>, !torch.int) -> ()
    %2991 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %2992 = "torch.aten.squeeze.dim"(%2990, %2991) : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,1,1,?],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%2992, %1478) <{shape_expressions = #map7}> : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>, !torch.int) -> ()
    %2993 = "torch_c.to_builtin_tensor"(%2982) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>) -> tensor<4x32x1x128xf8E4M3FNUZ>
    %2994 = "tensor.cast"(%2993) : (tensor<4x32x1x128xf8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %2995 = "torch_c.to_builtin_tensor"(%2985) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %2996 = "torch_c.to_builtin_tensor"(%2988) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %2997 = "torch_c.to_builtin_tensor"(%2992) : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>) -> tensor<4x1x1x?xf8E4M3FNUZ>
    %2998 = "tensor.cast"(%2997) : (tensor<4x1x1x?xf8E4M3FNUZ>) -> tensor<?x?x?x?xf8E4M3FNUZ>
    %2999 = "torch_c.to_builtin_tensor"(%236) : (!torch.vtensor<[],f32>) -> tensor<f32>
    %3000 = "util.call"(%2994, %2995, %2996, %2999, %2998) <{callee = @sharktank_masked_flash_attention_4_32_128_128_f8E4M3FNUZ_f32_f32}> : (tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<f32>, tensor<?x?x?x?xf8E4M3FNUZ>) -> tensor<4x32x?x128xf32>
    %3001 = "tensor.cast"(%3000) : (tensor<4x32x?x128xf32>) -> tensor<4x32x1x128xf32>
    %3002 = "torch_c.from_builtin_tensor"(%3001) : (tensor<4x32x1x128xf32>) -> !torch.vtensor<[4,32,1,128],f32>
    %3003 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %3004 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %3005 = "torch.aten.transpose.int"(%3002, %3003, %3004) : (!torch.vtensor<[4,32,1,128],f32>, !torch.int, !torch.int) -> !torch.vtensor<[4,1,32,128],f32>
    %3006 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %3007 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %3008 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %3009 = "torch.prim.ListConstruct"(%3006, %3007, %3008) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3010 = "torch.aten.view"(%3005, %3009) : (!torch.vtensor<[4,1,32,128],f32>, !torch.list<int>) -> !torch.vtensor<[4,1,4096],f32>
    %3011 = "torch.aten.div.Tensor"(%3010, %238) : (!torch.vtensor<[4,1,4096],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],f32>
    %3012 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %3013 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %3014 = "torch.aten.clamp"(%3011, %3012, %3013) : (!torch.vtensor<[4,1,4096],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],f32>
    %3015 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %3016 = "torch.prims.convert_element_type"(%3014, %3015) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %3017 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %3018 = "torch.aten.unsqueeze"(%240, %3017) : (!torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>
    %3019 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %3020 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %3021 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %3022 = "torch.prim.ListConstruct"(%3019, %3020, %3021) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3023 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %3024 = "torch.aten.expand"(%3018, %3022, %3023) : (!torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ>
    %3025 = "torch_c.to_builtin_tensor"(%3016) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %3026 = "torch_c.to_builtin_tensor"(%3024) : (!torch.vtensor<[4,4096,4096],f8E4M3FNUZ>) -> tensor<4x4096x4096xf8E4M3FNUZ>
    %3027 = "util.call"(%3025, %3026) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x1x4096xf32>
    %3028 = "torch_c.from_builtin_tensor"(%3027) : (tensor<4x1x4096xf32>) -> !torch.vtensor<[4,1,4096],f32>
    %3029 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %3030 = "torch.prims.convert_element_type"(%3028, %3029) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %3031 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %3032 = "torch.aten.add.Tensor"(%2631, %3030, %3031) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %3033 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %3034 = "torch.prims.convert_element_type"(%3032, %3033) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f32>
    %3035 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %3036 = "torch.aten.pow.Tensor_Scalar"(%3034, %3035) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],f32>
    %3037 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %3038 = "torch.prim.ListConstruct"(%3037) : (!torch.int) -> !torch.list<int>
    %3039 = "torch.constant.bool"() <{value = true}> : () -> !torch.bool
    %3040 = "torch.constant.none"() : () -> !torch.none
    %3041 = "torch.aten.mean.dim"(%3036, %3038, %3039, %3040) : (!torch.vtensor<[4,1,4096],f32>, !torch.list<int>, !torch.bool, !torch.none) -> !torch.vtensor<[4,1,1],f32>
    %3042 = "torch.constant.float"() <{value = 1.000000e-05 : f64}> : () -> !torch.float
    %3043 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %3044 = "torch.aten.add.Scalar"(%3041, %3042, %3043) : (!torch.vtensor<[4,1,1],f32>, !torch.float, !torch.int) -> !torch.vtensor<[4,1,1],f32>
    %3045 = "torch.aten.rsqrt"(%3044) : (!torch.vtensor<[4,1,1],f32>) -> !torch.vtensor<[4,1,1],f32>
    %3046 = "torch.aten.mul.Tensor"(%3034, %3045) : (!torch.vtensor<[4,1,4096],f32>, !torch.vtensor<[4,1,1],f32>) -> !torch.vtensor<[4,1,4096],f32>
    %3047 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %3048 = "torch.prims.convert_element_type"(%3046, %3047) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %3049 = "torch.aten.mul.Tensor"(%242, %3048) : (!torch.vtensor<[4096],bf16>, !torch.vtensor<[4,1,4096],bf16>) -> !torch.vtensor<[4,1,4096],bf16>
    %3050 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %3051 = "torch.prims.convert_element_type"(%3049, %3050) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %3052 = "torch.aten.div.Tensor"(%3051, %244) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %3053 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %3054 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %3055 = "torch.aten.clamp"(%3052, %3053, %3054) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %3056 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %3057 = "torch.prims.convert_element_type"(%3055, %3056) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %3058 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %3059 = "torch.aten.unsqueeze"(%246, %3058) : (!torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>
    %3060 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %3061 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %3062 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %3063 = "torch.prim.ListConstruct"(%3060, %3061, %3062) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3064 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %3065 = "torch.aten.expand"(%3059, %3063, %3064) : (!torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ>
    %3066 = "torch_c.to_builtin_tensor"(%3057) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %3067 = "torch_c.to_builtin_tensor"(%3065) : (!torch.vtensor<[4,14336,4096],f8E4M3FNUZ>) -> tensor<4x14336x4096xf8E4M3FNUZ>
    %3068 = "util.call"(%3066, %3067) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x1x14336xf32>
    %3069 = "torch_c.from_builtin_tensor"(%3068) : (tensor<4x1x14336xf32>) -> !torch.vtensor<[4,1,14336],f32>
    %3070 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %3071 = "torch.prims.convert_element_type"(%3069, %3070) : (!torch.vtensor<[4,1,14336],f32>, !torch.int) -> !torch.vtensor<[4,1,14336],bf16>
    %3072 = "torch.aten.silu"(%3071) : (!torch.vtensor<[4,1,14336],bf16>) -> !torch.vtensor<[4,1,14336],bf16>
    %3073 = "torch.aten.div.Tensor"(%3051, %248) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %3074 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %3075 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %3076 = "torch.aten.clamp"(%3073, %3074, %3075) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %3077 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %3078 = "torch.prims.convert_element_type"(%3076, %3077) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %3079 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %3080 = "torch.aten.unsqueeze"(%250, %3079) : (!torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>
    %3081 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %3082 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %3083 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %3084 = "torch.prim.ListConstruct"(%3081, %3082, %3083) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3085 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %3086 = "torch.aten.expand"(%3080, %3084, %3085) : (!torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ>
    %3087 = "torch_c.to_builtin_tensor"(%3078) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %3088 = "torch_c.to_builtin_tensor"(%3086) : (!torch.vtensor<[4,14336,4096],f8E4M3FNUZ>) -> tensor<4x14336x4096xf8E4M3FNUZ>
    %3089 = "util.call"(%3087, %3088) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x1x14336xf32>
    %3090 = "torch_c.from_builtin_tensor"(%3089) : (tensor<4x1x14336xf32>) -> !torch.vtensor<[4,1,14336],f32>
    %3091 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %3092 = "torch.prims.convert_element_type"(%3090, %3091) : (!torch.vtensor<[4,1,14336],f32>, !torch.int) -> !torch.vtensor<[4,1,14336],bf16>
    %3093 = "torch.aten.mul.Tensor"(%3072, %3092) : (!torch.vtensor<[4,1,14336],bf16>, !torch.vtensor<[4,1,14336],bf16>) -> !torch.vtensor<[4,1,14336],bf16>
    %3094 = "torch.aten.div.Tensor"(%3093, %252) : (!torch.vtensor<[4,1,14336],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,14336],bf16>
    %3095 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %3096 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %3097 = "torch.aten.clamp"(%3094, %3095, %3096) : (!torch.vtensor<[4,1,14336],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,14336],bf16>
    %3098 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %3099 = "torch.prims.convert_element_type"(%3097, %3098) : (!torch.vtensor<[4,1,14336],bf16>, !torch.int) -> !torch.vtensor<[4,1,14336],f8E4M3FNUZ>
    %3100 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %3101 = "torch.aten.unsqueeze"(%254, %3100) : (!torch.vtensor<[4096,14336],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,14336],f8E4M3FNUZ>
    %3102 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %3103 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %3104 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %3105 = "torch.prim.ListConstruct"(%3102, %3103, %3104) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3106 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %3107 = "torch.aten.expand"(%3101, %3105, %3106) : (!torch.vtensor<[1,4096,14336],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,14336],f8E4M3FNUZ>
    %3108 = "torch_c.to_builtin_tensor"(%3099) : (!torch.vtensor<[4,1,14336],f8E4M3FNUZ>) -> tensor<4x1x14336xf8E4M3FNUZ>
    %3109 = "torch_c.to_builtin_tensor"(%3107) : (!torch.vtensor<[4,4096,14336],f8E4M3FNUZ>) -> tensor<4x4096x14336xf8E4M3FNUZ>
    %3110 = "util.call"(%3108, %3109) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x14336xf8E4M3FNUZ_R4x4096x14336xf8E4M3FNUZ}> : (tensor<4x1x14336xf8E4M3FNUZ>, tensor<4x4096x14336xf8E4M3FNUZ>) -> tensor<4x1x4096xf32>
    %3111 = "torch_c.from_builtin_tensor"(%3110) : (tensor<4x1x4096xf32>) -> !torch.vtensor<[4,1,4096],f32>
    %3112 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %3113 = "torch.prims.convert_element_type"(%3111, %3112) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %3114 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %3115 = "torch.aten.add.Tensor"(%3032, %3113, %3114) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %3116 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %3117 = "torch.prims.convert_element_type"(%3115, %3116) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f32>
    %3118 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %3119 = "torch.aten.pow.Tensor_Scalar"(%3117, %3118) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],f32>
    %3120 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %3121 = "torch.prim.ListConstruct"(%3120) : (!torch.int) -> !torch.list<int>
    %3122 = "torch.constant.bool"() <{value = true}> : () -> !torch.bool
    %3123 = "torch.constant.none"() : () -> !torch.none
    %3124 = "torch.aten.mean.dim"(%3119, %3121, %3122, %3123) : (!torch.vtensor<[4,1,4096],f32>, !torch.list<int>, !torch.bool, !torch.none) -> !torch.vtensor<[4,1,1],f32>
    %3125 = "torch.constant.float"() <{value = 1.000000e-05 : f64}> : () -> !torch.float
    %3126 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %3127 = "torch.aten.add.Scalar"(%3124, %3125, %3126) : (!torch.vtensor<[4,1,1],f32>, !torch.float, !torch.int) -> !torch.vtensor<[4,1,1],f32>
    %3128 = "torch.aten.rsqrt"(%3127) : (!torch.vtensor<[4,1,1],f32>) -> !torch.vtensor<[4,1,1],f32>
    %3129 = "torch.aten.mul.Tensor"(%3117, %3128) : (!torch.vtensor<[4,1,4096],f32>, !torch.vtensor<[4,1,1],f32>) -> !torch.vtensor<[4,1,4096],f32>
    %3130 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %3131 = "torch.prims.convert_element_type"(%3129, %3130) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %3132 = "torch.aten.mul.Tensor"(%256, %3131) : (!torch.vtensor<[4096],bf16>, !torch.vtensor<[4,1,4096],bf16>) -> !torch.vtensor<[4,1,4096],bf16>
    %3133 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %3134 = "torch.prims.convert_element_type"(%3132, %3133) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %3135 = "torch.aten.div.Tensor"(%3134, %258) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %3136 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %3137 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %3138 = "torch.aten.clamp"(%3135, %3136, %3137) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %3139 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %3140 = "torch.prims.convert_element_type"(%3138, %3139) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %3141 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %3142 = "torch.aten.unsqueeze"(%260, %3141) : (!torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>
    %3143 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %3144 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %3145 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %3146 = "torch.prim.ListConstruct"(%3143, %3144, %3145) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3147 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %3148 = "torch.aten.expand"(%3142, %3146, %3147) : (!torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ>
    %3149 = "torch_c.to_builtin_tensor"(%3140) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %3150 = "torch_c.to_builtin_tensor"(%3148) : (!torch.vtensor<[4,4096,4096],f8E4M3FNUZ>) -> tensor<4x4096x4096xf8E4M3FNUZ>
    %3151 = "util.call"(%3149, %3150) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x1x4096xf32>
    %3152 = "torch_c.from_builtin_tensor"(%3151) : (tensor<4x1x4096xf32>) -> !torch.vtensor<[4,1,4096],f32>
    %3153 = "torch.aten.div.Tensor"(%3152, %262) : (!torch.vtensor<[4,1,4096],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],f32>
    %3154 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %3155 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %3156 = "torch.aten.clamp"(%3153, %3154, %3155) : (!torch.vtensor<[4,1,4096],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],f32>
    %3157 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %3158 = "torch.prims.convert_element_type"(%3156, %3157) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %3159 = "torch.aten.div.Tensor"(%3134, %264) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %3160 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %3161 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %3162 = "torch.aten.clamp"(%3159, %3160, %3161) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %3163 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %3164 = "torch.prims.convert_element_type"(%3162, %3163) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %3165 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %3166 = "torch.aten.unsqueeze"(%266, %3165) : (!torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>
    %3167 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %3168 = "torch.constant.int"() <{value = 1024 : i64}> : () -> !torch.int
    %3169 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %3170 = "torch.prim.ListConstruct"(%3167, %3168, %3169) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3171 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %3172 = "torch.aten.expand"(%3166, %3170, %3171) : (!torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ>
    %3173 = "torch_c.to_builtin_tensor"(%3164) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %3174 = "torch_c.to_builtin_tensor"(%3172) : (!torch.vtensor<[4,1024,4096],f8E4M3FNUZ>) -> tensor<4x1024x4096xf8E4M3FNUZ>
    %3175 = "util.call"(%3173, %3174) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x1x1024xf32>
    %3176 = "torch_c.from_builtin_tensor"(%3175) : (tensor<4x1x1024xf32>) -> !torch.vtensor<[4,1,1024],f32>
    %3177 = "torch.aten.div.Tensor"(%3176, %268) : (!torch.vtensor<[4,1,1024],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,1024],f32>
    %3178 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %3179 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %3180 = "torch.aten.clamp"(%3177, %3178, %3179) : (!torch.vtensor<[4,1,1024],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,1024],f32>
    %3181 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %3182 = "torch.prims.convert_element_type"(%3180, %3181) : (!torch.vtensor<[4,1,1024],f32>, !torch.int) -> !torch.vtensor<[4,1,1024],f8E4M3FNUZ>
    %3183 = "torch.aten.div.Tensor"(%3134, %270) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %3184 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %3185 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %3186 = "torch.aten.clamp"(%3183, %3184, %3185) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %3187 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %3188 = "torch.prims.convert_element_type"(%3186, %3187) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %3189 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %3190 = "torch.aten.unsqueeze"(%272, %3189) : (!torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>
    %3191 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %3192 = "torch.constant.int"() <{value = 1024 : i64}> : () -> !torch.int
    %3193 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %3194 = "torch.prim.ListConstruct"(%3191, %3192, %3193) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3195 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %3196 = "torch.aten.expand"(%3190, %3194, %3195) : (!torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ>
    %3197 = "torch_c.to_builtin_tensor"(%3188) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %3198 = "torch_c.to_builtin_tensor"(%3196) : (!torch.vtensor<[4,1024,4096],f8E4M3FNUZ>) -> tensor<4x1024x4096xf8E4M3FNUZ>
    %3199 = "util.call"(%3197, %3198) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x1x1024xf32>
    %3200 = "torch_c.from_builtin_tensor"(%3199) : (tensor<4x1x1024xf32>) -> !torch.vtensor<[4,1,1024],f32>
    %3201 = "torch.aten.div.Tensor"(%3200, %274) : (!torch.vtensor<[4,1,1024],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,1024],f32>
    %3202 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %3203 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %3204 = "torch.aten.clamp"(%3201, %3202, %3203) : (!torch.vtensor<[4,1,1024],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,1024],f32>
    %3205 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %3206 = "torch.prims.convert_element_type"(%3204, %3205) : (!torch.vtensor<[4,1,1024],f32>, !torch.int) -> !torch.vtensor<[4,1,1024],f8E4M3FNUZ>
    %3207 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %3208 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %3209 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %3210 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %3211 = "torch.prim.ListConstruct"(%3207, %3208, %3209, %3210) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3212 = "torch.aten.view"(%3158, %3211) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,1,32,128],f8E4M3FNUZ>
    %3213 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %3214 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %3215 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %3216 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %3217 = "torch.prim.ListConstruct"(%3213, %3214, %3215, %3216) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3218 = "torch.aten.view"(%3182, %3217) : (!torch.vtensor<[4,1,1024],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>
    %3219 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %3220 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %3221 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %3222 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %3223 = "torch.prim.ListConstruct"(%3219, %3220, %3221, %3222) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3224 = "torch.aten.view"(%3206, %3223) : (!torch.vtensor<[4,1,1024],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>
    %3225 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %3226 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %3227 = "torch.aten.transpose.int"(%3212, %3225, %3226) : (!torch.vtensor<[4,1,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %3228 = "torch.aten.mul.Tensor"(%3227, %1637) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,1,1,128],bf16>) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %3229 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %3230 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %3231 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %3232 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %3233 = "torch.aten.slice.Tensor"(%3227, %3229, %3230, %3231, %3232) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,32,1,64],f8E4M3FNUZ>
    %3234 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %3235 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %3236 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %3237 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %3238 = "torch.aten.slice.Tensor"(%3227, %3234, %3235, %3236, %3237) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,32,1,64],f8E4M3FNUZ>
    %3239 = "torch.aten.neg"(%3238) : (!torch.vtensor<[4,32,1,64],f8E4M3FNUZ>) -> !torch.vtensor<[4,32,1,64],f8E4M3FNUZ>
    %3240 = "torch.prim.ListConstruct"(%3239, %3233) : (!torch.vtensor<[4,32,1,64],f8E4M3FNUZ>, !torch.vtensor<[4,32,1,64],f8E4M3FNUZ>) -> !torch.list<vtensor>
    %3241 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %3242 = "torch.aten.cat"(%3240, %3241) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %3243 = "torch.aten.mul.Tensor"(%3242, %1651) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,1,1,128],bf16>) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %3244 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %3245 = "torch.aten.add.Tensor"(%3228, %3243, %3244) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %3246 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %3247 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %3248 = "torch.aten.transpose.int"(%3245, %3246, %3247) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,1,32,128],f8E4M3FNUZ>
    %3249 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %3250 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %3251 = "torch.aten.transpose.int"(%3218, %3249, %3250) : (!torch.vtensor<[4,1,8,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %3252 = "torch.aten.mul.Tensor"(%3251, %1637) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,1,1,128],bf16>) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %3253 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %3254 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %3255 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %3256 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %3257 = "torch.aten.slice.Tensor"(%3251, %3253, %3254, %3255, %3256) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,8,1,64],f8E4M3FNUZ>
    %3258 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %3259 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %3260 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %3261 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %3262 = "torch.aten.slice.Tensor"(%3251, %3258, %3259, %3260, %3261) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,8,1,64],f8E4M3FNUZ>
    %3263 = "torch.aten.neg"(%3262) : (!torch.vtensor<[4,8,1,64],f8E4M3FNUZ>) -> !torch.vtensor<[4,8,1,64],f8E4M3FNUZ>
    %3264 = "torch.prim.ListConstruct"(%3263, %3257) : (!torch.vtensor<[4,8,1,64],f8E4M3FNUZ>, !torch.vtensor<[4,8,1,64],f8E4M3FNUZ>) -> !torch.list<vtensor>
    %3265 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %3266 = "torch.aten.cat"(%3264, %3265) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %3267 = "torch.aten.mul.Tensor"(%3266, %1651) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,1,1,128],bf16>) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %3268 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %3269 = "torch.aten.add.Tensor"(%3252, %3267, %3268) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %3270 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %3271 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %3272 = "torch.aten.transpose.int"(%3269, %3270, %3271) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>
    %3273 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %3274 = "torch.aten.floor_divide.Scalar"(%arg64, %3273) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4],si64>
    %3275 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %3276 = "torch.aten.unsqueeze"(%3274, %3275) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %3277 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %3278 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %3279 = "torch.aten.gather"(%arg65, %3277, %3276, %3278) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.vtensor<[4,1],si64>, !torch.bool) -> !torch.vtensor<[4,1],si64>
    %3280 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %3281 = "torch.aten.remainder.Scalar"(%arg64, %3280) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4],si64>
    %3282 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %3283 = "torch.aten.unsqueeze"(%3281, %3282) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %3284 = "torch.constant.none"() : () -> !torch.none
    %3285 = "torch.aten.clone"(%275, %3284) : (!torch.vtensor<[],si64>, !torch.none) -> !torch.vtensor<[],si64>
    %3286 = "torch.aten.detach"(%3285) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %3287 = "torch.aten.detach"(%3286) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %3288 = "torch.aten.detach"(%3287) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %3289 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %3290 = "torch.aten.unsqueeze"(%3288, %3289) : (!torch.vtensor<[],si64>, !torch.int) -> !torch.vtensor<[1],si64>
    %3291 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %3292 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %3293 = "torch.prim.ListConstruct"(%3291, %3292) : (!torch.int, !torch.int) -> !torch.list<int>
    %3294 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %3295 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %3296 = "torch.prim.ListConstruct"(%3294, %3295) : (!torch.int, !torch.int) -> !torch.list<int>
    %3297 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %3298 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %3299 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %3300 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %3301 = "torch.aten.empty_strided"(%3293, %3296, %3297, %3298, %3299, %3300) : (!torch.list<int>, !torch.list<int>, !torch.int, !torch.int, !torch.Device, !torch.bool) -> !torch.vtensor<[4,1],si64>
    %3302 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %3303 = "torch.aten.fill.Scalar"(%3301, %3302) : (!torch.vtensor<[4,1],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %3304 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %3305 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %3306 = "torch.prim.ListConstruct"(%3304, %3305) : (!torch.int, !torch.int) -> !torch.list<int>
    %3307 = "torch.aten.repeat"(%3290, %3306) : (!torch.vtensor<[1],si64>, !torch.list<int>) -> !torch.vtensor<[4,1],si64>
    %3308 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %3309 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %3310 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %3311 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %3312 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %3313 = "torch.prim.ListConstruct"(%1483, %3308, %3309, %3310, %3311, %3312) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3314 = "torch.aten.view"(%2884, %3313) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%3314, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %3315 = "torch.prim.ListConstruct"(%3279, %3303, %3307, %3283) : (!torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>) -> !torch.list<optional<vtensor>>
    %3316 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %3317 = "torch.aten.index_put"(%3314, %3315, %3272, %3316) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>, !torch.bool) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%3317, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %3318 = "torch.constant.int"() <{value = 2097152 : i64}> : () -> !torch.int
    %3319 = "torch.prim.ListConstruct"(%1483, %3318) : (!torch.int, !torch.int) -> !torch.list<int>
    %3320 = "torch.aten.view"(%3317, %3319) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,2097152],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%3320, %1479) <{shape_expressions = #map2}> : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> ()
    %3321 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %3322 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %3323 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %3324 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %3325 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %3326 = "torch.prim.ListConstruct"(%1483, %3321, %3322, %3323, %3324, %3325) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3327 = "torch.aten.view"(%3320, %3326) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%3327, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %3328 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %3329 = "torch.aten.floor_divide.Scalar"(%arg64, %3328) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4],si64>
    %3330 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %3331 = "torch.aten.unsqueeze"(%3329, %3330) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %3332 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %3333 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %3334 = "torch.aten.gather"(%arg65, %3332, %3331, %3333) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.vtensor<[4,1],si64>, !torch.bool) -> !torch.vtensor<[4,1],si64>
    %3335 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %3336 = "torch.aten.remainder.Scalar"(%arg64, %3335) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4],si64>
    %3337 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %3338 = "torch.aten.unsqueeze"(%3336, %3337) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %3339 = "torch.constant.none"() : () -> !torch.none
    %3340 = "torch.aten.clone"(%276, %3339) : (!torch.vtensor<[],si64>, !torch.none) -> !torch.vtensor<[],si64>
    %3341 = "torch.aten.detach"(%3340) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %3342 = "torch.aten.detach"(%3341) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %3343 = "torch.aten.detach"(%3342) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %3344 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %3345 = "torch.aten.unsqueeze"(%3343, %3344) : (!torch.vtensor<[],si64>, !torch.int) -> !torch.vtensor<[1],si64>
    %3346 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %3347 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %3348 = "torch.prim.ListConstruct"(%3346, %3347) : (!torch.int, !torch.int) -> !torch.list<int>
    %3349 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %3350 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %3351 = "torch.prim.ListConstruct"(%3349, %3350) : (!torch.int, !torch.int) -> !torch.list<int>
    %3352 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %3353 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %3354 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %3355 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %3356 = "torch.aten.empty_strided"(%3348, %3351, %3352, %3353, %3354, %3355) : (!torch.list<int>, !torch.list<int>, !torch.int, !torch.int, !torch.Device, !torch.bool) -> !torch.vtensor<[4,1],si64>
    %3357 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %3358 = "torch.aten.fill.Scalar"(%3356, %3357) : (!torch.vtensor<[4,1],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %3359 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %3360 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %3361 = "torch.prim.ListConstruct"(%3359, %3360) : (!torch.int, !torch.int) -> !torch.list<int>
    %3362 = "torch.aten.repeat"(%3345, %3361) : (!torch.vtensor<[1],si64>, !torch.list<int>) -> !torch.vtensor<[4,1],si64>
    %3363 = "torch.prim.ListConstruct"(%3334, %3358, %3362, %3338) : (!torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>) -> !torch.list<optional<vtensor>>
    %3364 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %3365 = "torch.aten.index_put"(%3327, %3363, %3224, %3364) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>, !torch.bool) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%3365, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %3366 = "torch.constant.int"() <{value = 2097152 : i64}> : () -> !torch.int
    %3367 = "torch.prim.ListConstruct"(%1483, %3366) : (!torch.int, !torch.int) -> !torch.list<int>
    %3368 = "torch.aten.view"(%3365, %3367) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,2097152],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%3368, %1479) <{shape_expressions = #map2}> : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> ()
    %3369 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %3370 = "torch.aten.mul.Scalar"(%arg65, %3369) : (!torch.vtensor<[4,?],si64>, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%3370, %1478) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %3371 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %3372 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %3373 = "torch.aten.add.Scalar"(%3370, %3371, %3372) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%3373, %1478) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %3374 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %3375 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %3376 = "torch.aten.add.Scalar"(%3373, %3374, %3375) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%3376, %1478) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %3377 = "torch.prim.ListConstruct"(%1924) : (!torch.int) -> !torch.list<int>
    %3378 = "torch.aten.view"(%3376, %3377) : (!torch.vtensor<[4,?],si64>, !torch.list<int>) -> !torch.vtensor<[?],si64>
    "torch.bind_symbolic_shape"(%3378, %1478) <{shape_expressions = #map24}> : (!torch.vtensor<[?],si64>, !torch.int) -> ()
    %3379 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %3380 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %3381 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %3382 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %3383 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %3384 = "torch.prim.ListConstruct"(%1483, %3379, %3380, %3381, %3382, %3383) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3385 = "torch.aten.view"(%3368, %3384) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%3385, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %3386 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %3387 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %3388 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %3389 = "torch.prim.ListConstruct"(%1914, %3386, %3387, %3388) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3390 = "torch.aten.view"(%3385, %3389) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%3390, %1479) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %3391 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %3392 = "torch.aten.index_select"(%3390, %3391, %3378) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int, !torch.vtensor<[?],si64>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%3392, %1478) <{shape_expressions = #map23}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %3393 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %3394 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %3395 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %3396 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %3397 = "torch.prim.ListConstruct"(%3393, %1481, %3394, %3395, %3396) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3398 = "torch.aten.view"(%3392, %3397) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%3398, %1478) <{shape_expressions = #map22}> : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %3399 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %3400 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %3401 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %3402 = "torch.prim.ListConstruct"(%3399, %1485, %3400, %3401) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3403 = "torch.aten.view"(%3398, %3402) : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%3403, %1478) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %3404 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %3405 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %3406 = "torch.aten.add.Scalar"(%3373, %3404, %3405) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%3406, %1478) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %3407 = "torch.prim.ListConstruct"(%1924) : (!torch.int) -> !torch.list<int>
    %3408 = "torch.aten.view"(%3406, %3407) : (!torch.vtensor<[4,?],si64>, !torch.list<int>) -> !torch.vtensor<[?],si64>
    "torch.bind_symbolic_shape"(%3408, %1478) <{shape_expressions = #map24}> : (!torch.vtensor<[?],si64>, !torch.int) -> ()
    %3409 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %3410 = "torch.aten.index_select"(%3390, %3409, %3408) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int, !torch.vtensor<[?],si64>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%3410, %1478) <{shape_expressions = #map23}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %3411 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %3412 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %3413 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %3414 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %3415 = "torch.prim.ListConstruct"(%3411, %1481, %3412, %3413, %3414) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3416 = "torch.aten.view"(%3410, %3415) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%3416, %1478) <{shape_expressions = #map22}> : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %3417 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %3418 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %3419 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %3420 = "torch.prim.ListConstruct"(%3417, %1485, %3418, %3419) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3421 = "torch.aten.view"(%3416, %3420) : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%3421, %1478) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %3422 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %3423 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %3424 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %3425 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %3426 = "torch.aten.slice.Tensor"(%3403, %3422, %3423, %3424, %3425) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%3426, %1478) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %3427 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %3428 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %3429 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %3430 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %3431 = "torch.aten.slice.Tensor"(%3421, %3427, %3428, %3429, %3430) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%3431, %1478) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %3432 = "torch.constant.int"() <{value = -2 : i64}> : () -> !torch.int
    %3433 = "torch.aten.unsqueeze"(%3426, %3432) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%3433, %1478) <{shape_expressions = #map25}> : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.int) -> ()
    %3434 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %3435 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %3436 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %3437 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %3438 = "torch.prim.ListConstruct"(%3434, %1485, %3435, %3436, %3437) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3439 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %3440 = "torch.aten.expand"(%3433, %3438, %3439) : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%3440, %1478) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %3441 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %3442 = "torch.aten.clone"(%3440, %3441) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%3442, %1478) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %3443 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %3444 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %3445 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %3446 = "torch.prim.ListConstruct"(%3443, %1485, %3444, %3445) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3447 = "torch.aten._unsafe_view"(%3442, %3446) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%3447, %1478) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %3448 = "torch.constant.int"() <{value = -2 : i64}> : () -> !torch.int
    %3449 = "torch.aten.unsqueeze"(%3431, %3448) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%3449, %1478) <{shape_expressions = #map25}> : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.int) -> ()
    %3450 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %3451 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %3452 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %3453 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %3454 = "torch.prim.ListConstruct"(%3450, %1485, %3451, %3452, %3453) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3455 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %3456 = "torch.aten.expand"(%3449, %3454, %3455) : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%3456, %1478) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %3457 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %3458 = "torch.aten.clone"(%3456, %3457) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%3458, %1478) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %3459 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %3460 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %3461 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %3462 = "torch.prim.ListConstruct"(%3459, %1485, %3460, %3461) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3463 = "torch.aten._unsafe_view"(%3458, %3462) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%3463, %1478) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %3464 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %3465 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %3466 = "torch.aten.transpose.int"(%3248, %3464, %3465) : (!torch.vtensor<[4,1,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %3467 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %3468 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %3469 = "torch.aten.transpose.int"(%3447, %3467, %3468) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%3469, %1478) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %3470 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %3471 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %3472 = "torch.aten.transpose.int"(%3463, %3470, %3471) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%3472, %1478) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %3473 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %3474 = "torch.aten.squeeze.dim"(%1516, %3473) : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,1,1,?],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%3474, %1478) <{shape_expressions = #map7}> : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>, !torch.int) -> ()
    %3475 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %3476 = "torch.aten.squeeze.dim"(%3474, %3475) : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,1,1,?],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%3476, %1478) <{shape_expressions = #map7}> : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>, !torch.int) -> ()
    %3477 = "torch_c.to_builtin_tensor"(%3466) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>) -> tensor<4x32x1x128xf8E4M3FNUZ>
    %3478 = "tensor.cast"(%3477) : (tensor<4x32x1x128xf8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %3479 = "torch_c.to_builtin_tensor"(%3469) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %3480 = "torch_c.to_builtin_tensor"(%3472) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %3481 = "torch_c.to_builtin_tensor"(%3476) : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>) -> tensor<4x1x1x?xf8E4M3FNUZ>
    %3482 = "tensor.cast"(%3481) : (tensor<4x1x1x?xf8E4M3FNUZ>) -> tensor<?x?x?x?xf8E4M3FNUZ>
    %3483 = "torch_c.to_builtin_tensor"(%278) : (!torch.vtensor<[],f32>) -> tensor<f32>
    %3484 = "util.call"(%3478, %3479, %3480, %3483, %3482) <{callee = @sharktank_masked_flash_attention_4_32_128_128_f8E4M3FNUZ_f32_f32}> : (tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<f32>, tensor<?x?x?x?xf8E4M3FNUZ>) -> tensor<4x32x?x128xf32>
    %3485 = "tensor.cast"(%3484) : (tensor<4x32x?x128xf32>) -> tensor<4x32x1x128xf32>
    %3486 = "torch_c.from_builtin_tensor"(%3485) : (tensor<4x32x1x128xf32>) -> !torch.vtensor<[4,32,1,128],f32>
    %3487 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %3488 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %3489 = "torch.aten.transpose.int"(%3486, %3487, %3488) : (!torch.vtensor<[4,32,1,128],f32>, !torch.int, !torch.int) -> !torch.vtensor<[4,1,32,128],f32>
    %3490 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %3491 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %3492 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %3493 = "torch.prim.ListConstruct"(%3490, %3491, %3492) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3494 = "torch.aten.view"(%3489, %3493) : (!torch.vtensor<[4,1,32,128],f32>, !torch.list<int>) -> !torch.vtensor<[4,1,4096],f32>
    %3495 = "torch.aten.div.Tensor"(%3494, %280) : (!torch.vtensor<[4,1,4096],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],f32>
    %3496 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %3497 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %3498 = "torch.aten.clamp"(%3495, %3496, %3497) : (!torch.vtensor<[4,1,4096],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],f32>
    %3499 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %3500 = "torch.prims.convert_element_type"(%3498, %3499) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %3501 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %3502 = "torch.aten.unsqueeze"(%282, %3501) : (!torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>
    %3503 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %3504 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %3505 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %3506 = "torch.prim.ListConstruct"(%3503, %3504, %3505) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3507 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %3508 = "torch.aten.expand"(%3502, %3506, %3507) : (!torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ>
    %3509 = "torch_c.to_builtin_tensor"(%3500) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %3510 = "torch_c.to_builtin_tensor"(%3508) : (!torch.vtensor<[4,4096,4096],f8E4M3FNUZ>) -> tensor<4x4096x4096xf8E4M3FNUZ>
    %3511 = "util.call"(%3509, %3510) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x1x4096xf32>
    %3512 = "torch_c.from_builtin_tensor"(%3511) : (tensor<4x1x4096xf32>) -> !torch.vtensor<[4,1,4096],f32>
    %3513 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %3514 = "torch.prims.convert_element_type"(%3512, %3513) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %3515 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %3516 = "torch.aten.add.Tensor"(%3115, %3514, %3515) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %3517 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %3518 = "torch.prims.convert_element_type"(%3516, %3517) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f32>
    %3519 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %3520 = "torch.aten.pow.Tensor_Scalar"(%3518, %3519) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],f32>
    %3521 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %3522 = "torch.prim.ListConstruct"(%3521) : (!torch.int) -> !torch.list<int>
    %3523 = "torch.constant.bool"() <{value = true}> : () -> !torch.bool
    %3524 = "torch.constant.none"() : () -> !torch.none
    %3525 = "torch.aten.mean.dim"(%3520, %3522, %3523, %3524) : (!torch.vtensor<[4,1,4096],f32>, !torch.list<int>, !torch.bool, !torch.none) -> !torch.vtensor<[4,1,1],f32>
    %3526 = "torch.constant.float"() <{value = 1.000000e-05 : f64}> : () -> !torch.float
    %3527 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %3528 = "torch.aten.add.Scalar"(%3525, %3526, %3527) : (!torch.vtensor<[4,1,1],f32>, !torch.float, !torch.int) -> !torch.vtensor<[4,1,1],f32>
    %3529 = "torch.aten.rsqrt"(%3528) : (!torch.vtensor<[4,1,1],f32>) -> !torch.vtensor<[4,1,1],f32>
    %3530 = "torch.aten.mul.Tensor"(%3518, %3529) : (!torch.vtensor<[4,1,4096],f32>, !torch.vtensor<[4,1,1],f32>) -> !torch.vtensor<[4,1,4096],f32>
    %3531 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %3532 = "torch.prims.convert_element_type"(%3530, %3531) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %3533 = "torch.aten.mul.Tensor"(%284, %3532) : (!torch.vtensor<[4096],bf16>, !torch.vtensor<[4,1,4096],bf16>) -> !torch.vtensor<[4,1,4096],bf16>
    %3534 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %3535 = "torch.prims.convert_element_type"(%3533, %3534) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %3536 = "torch.aten.div.Tensor"(%3535, %286) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %3537 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %3538 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %3539 = "torch.aten.clamp"(%3536, %3537, %3538) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %3540 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %3541 = "torch.prims.convert_element_type"(%3539, %3540) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %3542 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %3543 = "torch.aten.unsqueeze"(%288, %3542) : (!torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>
    %3544 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %3545 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %3546 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %3547 = "torch.prim.ListConstruct"(%3544, %3545, %3546) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3548 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %3549 = "torch.aten.expand"(%3543, %3547, %3548) : (!torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ>
    %3550 = "torch_c.to_builtin_tensor"(%3541) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %3551 = "torch_c.to_builtin_tensor"(%3549) : (!torch.vtensor<[4,14336,4096],f8E4M3FNUZ>) -> tensor<4x14336x4096xf8E4M3FNUZ>
    %3552 = "util.call"(%3550, %3551) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x1x14336xf32>
    %3553 = "torch_c.from_builtin_tensor"(%3552) : (tensor<4x1x14336xf32>) -> !torch.vtensor<[4,1,14336],f32>
    %3554 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %3555 = "torch.prims.convert_element_type"(%3553, %3554) : (!torch.vtensor<[4,1,14336],f32>, !torch.int) -> !torch.vtensor<[4,1,14336],bf16>
    %3556 = "torch.aten.silu"(%3555) : (!torch.vtensor<[4,1,14336],bf16>) -> !torch.vtensor<[4,1,14336],bf16>
    %3557 = "torch.aten.div.Tensor"(%3535, %290) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %3558 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %3559 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %3560 = "torch.aten.clamp"(%3557, %3558, %3559) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %3561 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %3562 = "torch.prims.convert_element_type"(%3560, %3561) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %3563 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %3564 = "torch.aten.unsqueeze"(%292, %3563) : (!torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>
    %3565 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %3566 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %3567 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %3568 = "torch.prim.ListConstruct"(%3565, %3566, %3567) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3569 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %3570 = "torch.aten.expand"(%3564, %3568, %3569) : (!torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ>
    %3571 = "torch_c.to_builtin_tensor"(%3562) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %3572 = "torch_c.to_builtin_tensor"(%3570) : (!torch.vtensor<[4,14336,4096],f8E4M3FNUZ>) -> tensor<4x14336x4096xf8E4M3FNUZ>
    %3573 = "util.call"(%3571, %3572) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x1x14336xf32>
    %3574 = "torch_c.from_builtin_tensor"(%3573) : (tensor<4x1x14336xf32>) -> !torch.vtensor<[4,1,14336],f32>
    %3575 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %3576 = "torch.prims.convert_element_type"(%3574, %3575) : (!torch.vtensor<[4,1,14336],f32>, !torch.int) -> !torch.vtensor<[4,1,14336],bf16>
    %3577 = "torch.aten.mul.Tensor"(%3556, %3576) : (!torch.vtensor<[4,1,14336],bf16>, !torch.vtensor<[4,1,14336],bf16>) -> !torch.vtensor<[4,1,14336],bf16>
    %3578 = "torch.aten.div.Tensor"(%3577, %294) : (!torch.vtensor<[4,1,14336],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,14336],bf16>
    %3579 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %3580 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %3581 = "torch.aten.clamp"(%3578, %3579, %3580) : (!torch.vtensor<[4,1,14336],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,14336],bf16>
    %3582 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %3583 = "torch.prims.convert_element_type"(%3581, %3582) : (!torch.vtensor<[4,1,14336],bf16>, !torch.int) -> !torch.vtensor<[4,1,14336],f8E4M3FNUZ>
    %3584 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %3585 = "torch.aten.unsqueeze"(%296, %3584) : (!torch.vtensor<[4096,14336],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,14336],f8E4M3FNUZ>
    %3586 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %3587 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %3588 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %3589 = "torch.prim.ListConstruct"(%3586, %3587, %3588) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3590 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %3591 = "torch.aten.expand"(%3585, %3589, %3590) : (!torch.vtensor<[1,4096,14336],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,14336],f8E4M3FNUZ>
    %3592 = "torch_c.to_builtin_tensor"(%3583) : (!torch.vtensor<[4,1,14336],f8E4M3FNUZ>) -> tensor<4x1x14336xf8E4M3FNUZ>
    %3593 = "torch_c.to_builtin_tensor"(%3591) : (!torch.vtensor<[4,4096,14336],f8E4M3FNUZ>) -> tensor<4x4096x14336xf8E4M3FNUZ>
    %3594 = "util.call"(%3592, %3593) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x14336xf8E4M3FNUZ_R4x4096x14336xf8E4M3FNUZ}> : (tensor<4x1x14336xf8E4M3FNUZ>, tensor<4x4096x14336xf8E4M3FNUZ>) -> tensor<4x1x4096xf32>
    %3595 = "torch_c.from_builtin_tensor"(%3594) : (tensor<4x1x4096xf32>) -> !torch.vtensor<[4,1,4096],f32>
    %3596 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %3597 = "torch.prims.convert_element_type"(%3595, %3596) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %3598 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %3599 = "torch.aten.add.Tensor"(%3516, %3597, %3598) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %3600 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %3601 = "torch.prims.convert_element_type"(%3599, %3600) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f32>
    %3602 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %3603 = "torch.aten.pow.Tensor_Scalar"(%3601, %3602) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],f32>
    %3604 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %3605 = "torch.prim.ListConstruct"(%3604) : (!torch.int) -> !torch.list<int>
    %3606 = "torch.constant.bool"() <{value = true}> : () -> !torch.bool
    %3607 = "torch.constant.none"() : () -> !torch.none
    %3608 = "torch.aten.mean.dim"(%3603, %3605, %3606, %3607) : (!torch.vtensor<[4,1,4096],f32>, !torch.list<int>, !torch.bool, !torch.none) -> !torch.vtensor<[4,1,1],f32>
    %3609 = "torch.constant.float"() <{value = 1.000000e-05 : f64}> : () -> !torch.float
    %3610 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %3611 = "torch.aten.add.Scalar"(%3608, %3609, %3610) : (!torch.vtensor<[4,1,1],f32>, !torch.float, !torch.int) -> !torch.vtensor<[4,1,1],f32>
    %3612 = "torch.aten.rsqrt"(%3611) : (!torch.vtensor<[4,1,1],f32>) -> !torch.vtensor<[4,1,1],f32>
    %3613 = "torch.aten.mul.Tensor"(%3601, %3612) : (!torch.vtensor<[4,1,4096],f32>, !torch.vtensor<[4,1,1],f32>) -> !torch.vtensor<[4,1,4096],f32>
    %3614 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %3615 = "torch.prims.convert_element_type"(%3613, %3614) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %3616 = "torch.aten.mul.Tensor"(%298, %3615) : (!torch.vtensor<[4096],bf16>, !torch.vtensor<[4,1,4096],bf16>) -> !torch.vtensor<[4,1,4096],bf16>
    %3617 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %3618 = "torch.prims.convert_element_type"(%3616, %3617) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %3619 = "torch.aten.div.Tensor"(%3618, %300) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %3620 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %3621 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %3622 = "torch.aten.clamp"(%3619, %3620, %3621) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %3623 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %3624 = "torch.prims.convert_element_type"(%3622, %3623) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %3625 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %3626 = "torch.aten.unsqueeze"(%302, %3625) : (!torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>
    %3627 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %3628 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %3629 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %3630 = "torch.prim.ListConstruct"(%3627, %3628, %3629) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3631 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %3632 = "torch.aten.expand"(%3626, %3630, %3631) : (!torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ>
    %3633 = "torch_c.to_builtin_tensor"(%3624) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %3634 = "torch_c.to_builtin_tensor"(%3632) : (!torch.vtensor<[4,4096,4096],f8E4M3FNUZ>) -> tensor<4x4096x4096xf8E4M3FNUZ>
    %3635 = "util.call"(%3633, %3634) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x1x4096xf32>
    %3636 = "torch_c.from_builtin_tensor"(%3635) : (tensor<4x1x4096xf32>) -> !torch.vtensor<[4,1,4096],f32>
    %3637 = "torch.aten.div.Tensor"(%3636, %304) : (!torch.vtensor<[4,1,4096],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],f32>
    %3638 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %3639 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %3640 = "torch.aten.clamp"(%3637, %3638, %3639) : (!torch.vtensor<[4,1,4096],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],f32>
    %3641 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %3642 = "torch.prims.convert_element_type"(%3640, %3641) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %3643 = "torch.aten.div.Tensor"(%3618, %306) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %3644 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %3645 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %3646 = "torch.aten.clamp"(%3643, %3644, %3645) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %3647 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %3648 = "torch.prims.convert_element_type"(%3646, %3647) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %3649 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %3650 = "torch.aten.unsqueeze"(%308, %3649) : (!torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>
    %3651 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %3652 = "torch.constant.int"() <{value = 1024 : i64}> : () -> !torch.int
    %3653 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %3654 = "torch.prim.ListConstruct"(%3651, %3652, %3653) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3655 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %3656 = "torch.aten.expand"(%3650, %3654, %3655) : (!torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ>
    %3657 = "torch_c.to_builtin_tensor"(%3648) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %3658 = "torch_c.to_builtin_tensor"(%3656) : (!torch.vtensor<[4,1024,4096],f8E4M3FNUZ>) -> tensor<4x1024x4096xf8E4M3FNUZ>
    %3659 = "util.call"(%3657, %3658) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x1x1024xf32>
    %3660 = "torch_c.from_builtin_tensor"(%3659) : (tensor<4x1x1024xf32>) -> !torch.vtensor<[4,1,1024],f32>
    %3661 = "torch.aten.div.Tensor"(%3660, %310) : (!torch.vtensor<[4,1,1024],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,1024],f32>
    %3662 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %3663 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %3664 = "torch.aten.clamp"(%3661, %3662, %3663) : (!torch.vtensor<[4,1,1024],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,1024],f32>
    %3665 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %3666 = "torch.prims.convert_element_type"(%3664, %3665) : (!torch.vtensor<[4,1,1024],f32>, !torch.int) -> !torch.vtensor<[4,1,1024],f8E4M3FNUZ>
    %3667 = "torch.aten.div.Tensor"(%3618, %312) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %3668 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %3669 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %3670 = "torch.aten.clamp"(%3667, %3668, %3669) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %3671 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %3672 = "torch.prims.convert_element_type"(%3670, %3671) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %3673 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %3674 = "torch.aten.unsqueeze"(%314, %3673) : (!torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>
    %3675 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %3676 = "torch.constant.int"() <{value = 1024 : i64}> : () -> !torch.int
    %3677 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %3678 = "torch.prim.ListConstruct"(%3675, %3676, %3677) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3679 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %3680 = "torch.aten.expand"(%3674, %3678, %3679) : (!torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ>
    %3681 = "torch_c.to_builtin_tensor"(%3672) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %3682 = "torch_c.to_builtin_tensor"(%3680) : (!torch.vtensor<[4,1024,4096],f8E4M3FNUZ>) -> tensor<4x1024x4096xf8E4M3FNUZ>
    %3683 = "util.call"(%3681, %3682) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x1x1024xf32>
    %3684 = "torch_c.from_builtin_tensor"(%3683) : (tensor<4x1x1024xf32>) -> !torch.vtensor<[4,1,1024],f32>
    %3685 = "torch.aten.div.Tensor"(%3684, %316) : (!torch.vtensor<[4,1,1024],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,1024],f32>
    %3686 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %3687 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %3688 = "torch.aten.clamp"(%3685, %3686, %3687) : (!torch.vtensor<[4,1,1024],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,1024],f32>
    %3689 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %3690 = "torch.prims.convert_element_type"(%3688, %3689) : (!torch.vtensor<[4,1,1024],f32>, !torch.int) -> !torch.vtensor<[4,1,1024],f8E4M3FNUZ>
    %3691 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %3692 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %3693 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %3694 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %3695 = "torch.prim.ListConstruct"(%3691, %3692, %3693, %3694) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3696 = "torch.aten.view"(%3642, %3695) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,1,32,128],f8E4M3FNUZ>
    %3697 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %3698 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %3699 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %3700 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %3701 = "torch.prim.ListConstruct"(%3697, %3698, %3699, %3700) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3702 = "torch.aten.view"(%3666, %3701) : (!torch.vtensor<[4,1,1024],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>
    %3703 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %3704 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %3705 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %3706 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %3707 = "torch.prim.ListConstruct"(%3703, %3704, %3705, %3706) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3708 = "torch.aten.view"(%3690, %3707) : (!torch.vtensor<[4,1,1024],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>
    %3709 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %3710 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %3711 = "torch.aten.transpose.int"(%3696, %3709, %3710) : (!torch.vtensor<[4,1,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %3712 = "torch.aten.mul.Tensor"(%3711, %1637) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,1,1,128],bf16>) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %3713 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %3714 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %3715 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %3716 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %3717 = "torch.aten.slice.Tensor"(%3711, %3713, %3714, %3715, %3716) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,32,1,64],f8E4M3FNUZ>
    %3718 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %3719 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %3720 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %3721 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %3722 = "torch.aten.slice.Tensor"(%3711, %3718, %3719, %3720, %3721) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,32,1,64],f8E4M3FNUZ>
    %3723 = "torch.aten.neg"(%3722) : (!torch.vtensor<[4,32,1,64],f8E4M3FNUZ>) -> !torch.vtensor<[4,32,1,64],f8E4M3FNUZ>
    %3724 = "torch.prim.ListConstruct"(%3723, %3717) : (!torch.vtensor<[4,32,1,64],f8E4M3FNUZ>, !torch.vtensor<[4,32,1,64],f8E4M3FNUZ>) -> !torch.list<vtensor>
    %3725 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %3726 = "torch.aten.cat"(%3724, %3725) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %3727 = "torch.aten.mul.Tensor"(%3726, %1651) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,1,1,128],bf16>) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %3728 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %3729 = "torch.aten.add.Tensor"(%3712, %3727, %3728) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %3730 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %3731 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %3732 = "torch.aten.transpose.int"(%3729, %3730, %3731) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,1,32,128],f8E4M3FNUZ>
    %3733 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %3734 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %3735 = "torch.aten.transpose.int"(%3702, %3733, %3734) : (!torch.vtensor<[4,1,8,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %3736 = "torch.aten.mul.Tensor"(%3735, %1637) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,1,1,128],bf16>) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %3737 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %3738 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %3739 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %3740 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %3741 = "torch.aten.slice.Tensor"(%3735, %3737, %3738, %3739, %3740) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,8,1,64],f8E4M3FNUZ>
    %3742 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %3743 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %3744 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %3745 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %3746 = "torch.aten.slice.Tensor"(%3735, %3742, %3743, %3744, %3745) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,8,1,64],f8E4M3FNUZ>
    %3747 = "torch.aten.neg"(%3746) : (!torch.vtensor<[4,8,1,64],f8E4M3FNUZ>) -> !torch.vtensor<[4,8,1,64],f8E4M3FNUZ>
    %3748 = "torch.prim.ListConstruct"(%3747, %3741) : (!torch.vtensor<[4,8,1,64],f8E4M3FNUZ>, !torch.vtensor<[4,8,1,64],f8E4M3FNUZ>) -> !torch.list<vtensor>
    %3749 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %3750 = "torch.aten.cat"(%3748, %3749) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %3751 = "torch.aten.mul.Tensor"(%3750, %1651) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,1,1,128],bf16>) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %3752 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %3753 = "torch.aten.add.Tensor"(%3736, %3751, %3752) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %3754 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %3755 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %3756 = "torch.aten.transpose.int"(%3753, %3754, %3755) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>
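    // Paged KV cache write for K. Token positions %arg64 are split into a page
    // slot (pos // 32) and an in-page offset (pos % 32); the slot gathers the
    // physical page id from the page table %arg65. The flat cache %3368 is
    // viewed as [pages, 32, 2, 32, 8, 128] and index_put scatters the rotated
    // K (%3756) at [page_id, 4, %317, offset]: the dim-1 index filled with 4
    // suggests this is decoder block 4, and %317 presumably selects the K
    // plane of the K/V pair.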
    %3757 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %3758 = "torch.aten.floor_divide.Scalar"(%arg64, %3757) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4],si64>
    %3759 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %3760 = "torch.aten.unsqueeze"(%3758, %3759) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %3761 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %3762 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %3763 = "torch.aten.gather"(%arg65, %3761, %3760, %3762) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.vtensor<[4,1],si64>, !torch.bool) -> !torch.vtensor<[4,1],si64>
    %3764 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %3765 = "torch.aten.remainder.Scalar"(%arg64, %3764) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4],si64>
    %3766 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %3767 = "torch.aten.unsqueeze"(%3765, %3766) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %3768 = "torch.constant.none"() : () -> !torch.none
    %3769 = "torch.aten.clone"(%317, %3768) : (!torch.vtensor<[],si64>, !torch.none) -> !torch.vtensor<[],si64>
    %3770 = "torch.aten.detach"(%3769) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %3771 = "torch.aten.detach"(%3770) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %3772 = "torch.aten.detach"(%3771) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %3773 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %3774 = "torch.aten.unsqueeze"(%3772, %3773) : (!torch.vtensor<[],si64>, !torch.int) -> !torch.vtensor<[1],si64>
    %3775 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %3776 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %3777 = "torch.prim.ListConstruct"(%3775, %3776) : (!torch.int, !torch.int) -> !torch.list<int>
    %3778 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %3779 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %3780 = "torch.prim.ListConstruct"(%3778, %3779) : (!torch.int, !torch.int) -> !torch.list<int>
    %3781 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %3782 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %3783 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %3784 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %3785 = "torch.aten.empty_strided"(%3777, %3780, %3781, %3782, %3783, %3784) : (!torch.list<int>, !torch.list<int>, !torch.int, !torch.int, !torch.Device, !torch.bool) -> !torch.vtensor<[4,1],si64>
    %3786 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %3787 = "torch.aten.fill.Scalar"(%3785, %3786) : (!torch.vtensor<[4,1],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %3788 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %3789 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %3790 = "torch.prim.ListConstruct"(%3788, %3789) : (!torch.int, !torch.int) -> !torch.list<int>
    %3791 = "torch.aten.repeat"(%3774, %3790) : (!torch.vtensor<[1],si64>, !torch.list<int>) -> !torch.vtensor<[4,1],si64>
    %3792 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %3793 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %3794 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %3795 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %3796 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %3797 = "torch.prim.ListConstruct"(%1483, %3792, %3793, %3794, %3795, %3796) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3798 = "torch.aten.view"(%3368, %3797) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%3798, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %3799 = "torch.prim.ListConstruct"(%3763, %3787, %3791, %3767) : (!torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>) -> !torch.list<optional<vtensor>>
    %3800 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %3801 = "torch.aten.index_put"(%3798, %3799, %3756, %3800) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>, !torch.bool) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%3801, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %3802 = "torch.constant.int"() <{value = 2097152 : i64}> : () -> !torch.int
    %3803 = "torch.prim.ListConstruct"(%1483, %3802) : (!torch.int, !torch.int) -> !torch.list<int>
    %3804 = "torch.aten.view"(%3801, %3803) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,2097152],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%3804, %1479) <{shape_expressions = #map2}> : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> ()
    %3805 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %3806 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %3807 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %3808 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %3809 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %3810 = "torch.prim.ListConstruct"(%1483, %3805, %3806, %3807, %3808, %3809) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3811 = "torch.aten.view"(%3804, %3810) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%3811, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %3812 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %3813 = "torch.aten.floor_divide.Scalar"(%arg64, %3812) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4],si64>
    %3814 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %3815 = "torch.aten.unsqueeze"(%3813, %3814) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %3816 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %3817 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %3818 = "torch.aten.gather"(%arg65, %3816, %3815, %3817) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.vtensor<[4,1],si64>, !torch.bool) -> !torch.vtensor<[4,1],si64>
    %3819 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %3820 = "torch.aten.remainder.Scalar"(%arg64, %3819) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4],si64>
    %3821 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %3822 = "torch.aten.unsqueeze"(%3820, %3821) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %3823 = "torch.constant.none"() : () -> !torch.none
    %3824 = "torch.aten.clone"(%318, %3823) : (!torch.vtensor<[],si64>, !torch.none) -> !torch.vtensor<[],si64>
    %3825 = "torch.aten.detach"(%3824) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %3826 = "torch.aten.detach"(%3825) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %3827 = "torch.aten.detach"(%3826) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %3828 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %3829 = "torch.aten.unsqueeze"(%3827, %3828) : (!torch.vtensor<[],si64>, !torch.int) -> !torch.vtensor<[1],si64>
    %3830 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %3831 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %3832 = "torch.prim.ListConstruct"(%3830, %3831) : (!torch.int, !torch.int) -> !torch.list<int>
    %3833 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %3834 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %3835 = "torch.prim.ListConstruct"(%3833, %3834) : (!torch.int, !torch.int) -> !torch.list<int>
    %3836 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %3837 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %3838 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %3839 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %3840 = "torch.aten.empty_strided"(%3832, %3835, %3836, %3837, %3838, %3839) : (!torch.list<int>, !torch.list<int>, !torch.int, !torch.int, !torch.Device, !torch.bool) -> !torch.vtensor<[4,1],si64>
    %3841 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %3842 = "torch.aten.fill.Scalar"(%3840, %3841) : (!torch.vtensor<[4,1],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %3843 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %3844 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %3845 = "torch.prim.ListConstruct"(%3843, %3844) : (!torch.int, !torch.int) -> !torch.list<int>
    %3846 = "torch.aten.repeat"(%3829, %3845) : (!torch.vtensor<[1],si64>, !torch.list<int>) -> !torch.vtensor<[4,1],si64>
    %3847 = "torch.prim.ListConstruct"(%3818, %3842, %3846, %3822) : (!torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>) -> !torch.list<optional<vtensor>>
    %3848 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %3849 = "torch.aten.index_put"(%3811, %3847, %3708, %3848) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>, !torch.bool) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%3849, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %3850 = "torch.constant.int"() <{value = 2097152 : i64}> : () -> !torch.int
    %3851 = "torch.prim.ListConstruct"(%1483, %3850) : (!torch.int, !torch.int) -> !torch.list<int>
    %3852 = "torch.aten.view"(%3849, %3851) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,2097152],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%3852, %1479) <{shape_expressions = #map2}> : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> ()
    %3853 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %3854 = "torch.aten.mul.Scalar"(%arg65, %3853) : (!torch.vtensor<[4,?],si64>, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%3854, %1478) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %3855 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %3856 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %3857 = "torch.aten.add.Scalar"(%3854, %3855, %3856) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%3857, %1478) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %3858 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %3859 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %3860 = "torch.aten.add.Scalar"(%3857, %3858, %3859) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%3860, %1478) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %3861 = "torch.prim.ListConstruct"(%1924) : (!torch.int) -> !torch.list<int>
    %3862 = "torch.aten.view"(%3860, %3861) : (!torch.vtensor<[4,?],si64>, !torch.list<int>) -> !torch.vtensor<[?],si64>
    "torch.bind_symbolic_shape"(%3862, %1478) <{shape_expressions = #map24}> : (!torch.vtensor<[?],si64>, !torch.int) -> ()
    %3863 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %3864 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %3865 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %3866 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %3867 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %3868 = "torch.prim.ListConstruct"(%1483, %3863, %3864, %3865, %3866, %3867) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3869 = "torch.aten.view"(%3852, %3868) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%3869, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %3870 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %3871 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %3872 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %3873 = "torch.prim.ListConstruct"(%1914, %3870, %3871, %3872) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3874 = "torch.aten.view"(%3869, %3873) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%3874, %1479) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %3875 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %3876 = "torch.aten.index_select"(%3874, %3875, %3862) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int, !torch.vtensor<[?],si64>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%3876, %1478) <{shape_expressions = #map23}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %3877 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %3878 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %3879 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %3880 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %3881 = "torch.prim.ListConstruct"(%3877, %1481, %3878, %3879, %3880) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3882 = "torch.aten.view"(%3876, %3881) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%3882, %1478) <{shape_expressions = #map22}> : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %3883 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %3884 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %3885 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %3886 = "torch.prim.ListConstruct"(%3883, %1485, %3884, %3885) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3887 = "torch.aten.view"(%3882, %3886) : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%3887, %1478) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %3888 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %3889 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %3890 = "torch.aten.add.Scalar"(%3857, %3888, %3889) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%3890, %1478) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %3891 = "torch.prim.ListConstruct"(%1924) : (!torch.int) -> !torch.list<int>
    %3892 = "torch.aten.view"(%3890, %3891) : (!torch.vtensor<[4,?],si64>, !torch.list<int>) -> !torch.vtensor<[?],si64>
    "torch.bind_symbolic_shape"(%3892, %1478) <{shape_expressions = #map24}> : (!torch.vtensor<[?],si64>, !torch.int) -> ()
    %3893 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %3894 = "torch.aten.index_select"(%3874, %3893, %3892) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int, !torch.vtensor<[?],si64>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%3894, %1478) <{shape_expressions = #map23}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %3895 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %3896 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %3897 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %3898 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %3899 = "torch.prim.ListConstruct"(%3895, %1481, %3896, %3897, %3898) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3900 = "torch.aten.view"(%3894, %3899) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%3900, %1478) <{shape_expressions = #map22}> : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %3901 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %3902 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %3903 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %3904 = "torch.prim.ListConstruct"(%3901, %1485, %3902, %3903) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3905 = "torch.aten.view"(%3900, %3904) : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%3905, %1478) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %3906 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %3907 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %3908 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %3909 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %3910 = "torch.aten.slice.Tensor"(%3887, %3906, %3907, %3908, %3909) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%3910, %1478) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %3911 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %3912 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %3913 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %3914 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %3915 = "torch.aten.slice.Tensor"(%3905, %3911, %3912, %3913, %3914) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%3915, %1478) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %3916 = "torch.constant.int"() <{value = -2 : i64}> : () -> !torch.int
    %3917 = "torch.aten.unsqueeze"(%3910, %3916) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%3917, %1478) <{shape_expressions = #map25}> : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.int) -> ()
    %3918 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %3919 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %3920 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %3921 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %3922 = "torch.prim.ListConstruct"(%3918, %1485, %3919, %3920, %3921) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3923 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %3924 = "torch.aten.expand"(%3917, %3922, %3923) : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%3924, %1478) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %3925 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %3926 = "torch.aten.clone"(%3924, %3925) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%3926, %1478) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %3927 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %3928 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %3929 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %3930 = "torch.prim.ListConstruct"(%3927, %1485, %3928, %3929) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3931 = "torch.aten._unsafe_view"(%3926, %3930) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%3931, %1478) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %3932 = "torch.constant.int"() <{value = -2 : i64}> : () -> !torch.int
    %3933 = "torch.aten.unsqueeze"(%3915, %3932) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%3933, %1478) <{shape_expressions = #map25}> : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.int) -> ()
    %3934 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %3935 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %3936 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %3937 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %3938 = "torch.prim.ListConstruct"(%3934, %1485, %3935, %3936, %3937) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3939 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %3940 = "torch.aten.expand"(%3933, %3938, %3939) : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%3940, %1478) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %3941 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %3942 = "torch.aten.clone"(%3940, %3941) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%3942, %1478) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %3943 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %3944 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %3945 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %3946 = "torch.prim.ListConstruct"(%3943, %1485, %3944, %3945) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3947 = "torch.aten._unsafe_view"(%3942, %3946) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%3947, %1478) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %3948 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %3949 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %3950 = "torch.aten.transpose.int"(%3732, %3948, %3949) : (!torch.vtensor<[4,1,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %3951 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %3952 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %3953 = "torch.aten.transpose.int"(%3931, %3951, %3952) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%3953, %1478) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %3954 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %3955 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %3956 = "torch.aten.transpose.int"(%3947, %3954, %3955) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%3956, %1478) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %3957 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %3958 = "torch.aten.squeeze.dim"(%1516, %3957) : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,1,1,?],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%3958, %1478) <{shape_expressions = #map7}> : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>, !torch.int) -> ()
    %3959 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %3960 = "torch.aten.squeeze.dim"(%3958, %3959) : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,1,1,?],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%3960, %1478) <{shape_expressions = #map7}> : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>, !torch.int) -> ()
    %3961 = "torch_c.to_builtin_tensor"(%3950) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>) -> tensor<4x32x1x128xf8E4M3FNUZ>
    %3962 = "tensor.cast"(%3961) : (tensor<4x32x1x128xf8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %3963 = "torch_c.to_builtin_tensor"(%3953) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %3964 = "torch_c.to_builtin_tensor"(%3956) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %3965 = "torch_c.to_builtin_tensor"(%3960) : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>) -> tensor<4x1x1x?xf8E4M3FNUZ>
    %3966 = "tensor.cast"(%3965) : (tensor<4x1x1x?xf8E4M3FNUZ>) -> tensor<?x?x?x?xf8E4M3FNUZ>
    %3967 = "torch_c.to_builtin_tensor"(%320) : (!torch.vtensor<[],f32>) -> tensor<f32>
    %3968 = "util.call"(%3962, %3963, %3964, %3967, %3966) <{callee = @sharktank_masked_flash_attention_4_32_128_128_f8E4M3FNUZ_f32_f32}> : (tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<f32>, tensor<?x?x?x?xf8E4M3FNUZ>) -> tensor<4x32x?x128xf32>
    %3969 = "tensor.cast"(%3968) : (tensor<4x32x?x128xf32>) -> tensor<4x32x1x128xf32>
    %3970 = "torch_c.from_builtin_tensor"(%3969) : (tensor<4x32x1x128xf32>) -> !torch.vtensor<[4,32,1,128],f32>
    %3971 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %3972 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %3973 = "torch.aten.transpose.int"(%3970, %3971, %3972) : (!torch.vtensor<[4,32,1,128],f32>, !torch.int, !torch.int) -> !torch.vtensor<[4,1,32,128],f32>
    %3974 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %3975 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %3976 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %3977 = "torch.prim.ListConstruct"(%3974, %3975, %3976) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3978 = "torch.aten.view"(%3973, %3977) : (!torch.vtensor<[4,1,32,128],f32>, !torch.list<int>) -> !torch.vtensor<[4,1,4096],f32>
    %3979 = "torch.aten.div.Tensor"(%3978, %322) : (!torch.vtensor<[4,1,4096],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],f32>
    %3980 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %3981 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %3982 = "torch.aten.clamp"(%3979, %3980, %3981) : (!torch.vtensor<[4,1,4096],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],f32>
    %3983 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %3984 = "torch.prims.convert_element_type"(%3982, %3983) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %3985 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %3986 = "torch.aten.unsqueeze"(%324, %3985) : (!torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>
    %3987 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %3988 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %3989 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %3990 = "torch.prim.ListConstruct"(%3987, %3988, %3989) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3991 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %3992 = "torch.aten.expand"(%3986, %3990, %3991) : (!torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ>
    %3993 = "torch_c.to_builtin_tensor"(%3984) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %3994 = "torch_c.to_builtin_tensor"(%3992) : (!torch.vtensor<[4,4096,4096],f8E4M3FNUZ>) -> tensor<4x4096x4096xf8E4M3FNUZ>
    %3995 = "util.call"(%3993, %3994) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x1x4096xf32>
    %3996 = "torch_c.from_builtin_tensor"(%3995) : (tensor<4x1x4096xf32>) -> !torch.vtensor<[4,1,4096],f32>
    %3997 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %3998 = "torch.prims.convert_element_type"(%3996, %3997) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %3999 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %4000 = "torch.aten.add.Tensor"(%3599, %3998, %3999) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %4001 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %4002 = "torch.prims.convert_element_type"(%4000, %4001) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f32>
    %4003 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %4004 = "torch.aten.pow.Tensor_Scalar"(%4002, %4003) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],f32>
    %4005 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %4006 = "torch.prim.ListConstruct"(%4005) : (!torch.int) -> !torch.list<int>
    %4007 = "torch.constant.bool"() <{value = true}> : () -> !torch.bool
    %4008 = "torch.constant.none"() : () -> !torch.none
    %4009 = "torch.aten.mean.dim"(%4004, %4006, %4007, %4008) : (!torch.vtensor<[4,1,4096],f32>, !torch.list<int>, !torch.bool, !torch.none) -> !torch.vtensor<[4,1,1],f32>
    %4010 = "torch.constant.float"() <{value = 1.000000e-05 : f64}> : () -> !torch.float
    %4011 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %4012 = "torch.aten.add.Scalar"(%4009, %4010, %4011) : (!torch.vtensor<[4,1,1],f32>, !torch.float, !torch.int) -> !torch.vtensor<[4,1,1],f32>
    %4013 = "torch.aten.rsqrt"(%4012) : (!torch.vtensor<[4,1,1],f32>) -> !torch.vtensor<[4,1,1],f32>
    %4014 = "torch.aten.mul.Tensor"(%4002, %4013) : (!torch.vtensor<[4,1,4096],f32>, !torch.vtensor<[4,1,1],f32>) -> !torch.vtensor<[4,1,4096],f32>
    %4015 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %4016 = "torch.prims.convert_element_type"(%4014, %4015) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %4017 = "torch.aten.mul.Tensor"(%326, %4016) : (!torch.vtensor<[4096],bf16>, !torch.vtensor<[4,1,4096],bf16>) -> !torch.vtensor<[4,1,4096],bf16>
    %4018 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %4019 = "torch.prims.convert_element_type"(%4017, %4018) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %4020 = "torch.aten.div.Tensor"(%4019, %328) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %4021 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %4022 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %4023 = "torch.aten.clamp"(%4020, %4021, %4022) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %4024 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %4025 = "torch.prims.convert_element_type"(%4023, %4024) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %4026 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %4027 = "torch.aten.unsqueeze"(%330, %4026) : (!torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>
    %4028 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %4029 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %4030 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %4031 = "torch.prim.ListConstruct"(%4028, %4029, %4030) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4032 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %4033 = "torch.aten.expand"(%4027, %4031, %4032) : (!torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ>
    %4034 = "torch_c.to_builtin_tensor"(%4025) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %4035 = "torch_c.to_builtin_tensor"(%4033) : (!torch.vtensor<[4,14336,4096],f8E4M3FNUZ>) -> tensor<4x14336x4096xf8E4M3FNUZ>
    %4036 = "util.call"(%4034, %4035) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x1x14336xf32>
    %4037 = "torch_c.from_builtin_tensor"(%4036) : (tensor<4x1x14336xf32>) -> !torch.vtensor<[4,1,14336],f32>
    %4038 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %4039 = "torch.prims.convert_element_type"(%4037, %4038) : (!torch.vtensor<[4,1,14336],f32>, !torch.int) -> !torch.vtensor<[4,1,14336],bf16>
    %4040 = "torch.aten.silu"(%4039) : (!torch.vtensor<[4,1,14336],bf16>) -> !torch.vtensor<[4,1,14336],bf16>
    %4041 = "torch.aten.div.Tensor"(%4019, %332) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %4042 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %4043 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %4044 = "torch.aten.clamp"(%4041, %4042, %4043) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %4045 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %4046 = "torch.prims.convert_element_type"(%4044, %4045) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %4047 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %4048 = "torch.aten.unsqueeze"(%334, %4047) : (!torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>
    %4049 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %4050 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %4051 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %4052 = "torch.prim.ListConstruct"(%4049, %4050, %4051) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4053 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %4054 = "torch.aten.expand"(%4048, %4052, %4053) : (!torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ>
    %4055 = "torch_c.to_builtin_tensor"(%4046) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %4056 = "torch_c.to_builtin_tensor"(%4054) : (!torch.vtensor<[4,14336,4096],f8E4M3FNUZ>) -> tensor<4x14336x4096xf8E4M3FNUZ>
    %4057 = "util.call"(%4055, %4056) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x1x14336xf32>
    %4058 = "torch_c.from_builtin_tensor"(%4057) : (tensor<4x1x14336xf32>) -> !torch.vtensor<[4,1,14336],f32>
    %4059 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %4060 = "torch.prims.convert_element_type"(%4058, %4059) : (!torch.vtensor<[4,1,14336],f32>, !torch.int) -> !torch.vtensor<[4,1,14336],bf16>
    %4061 = "torch.aten.mul.Tensor"(%4040, %4060) : (!torch.vtensor<[4,1,14336],bf16>, !torch.vtensor<[4,1,14336],bf16>) -> !torch.vtensor<[4,1,14336],bf16>
    %4062 = "torch.aten.div.Tensor"(%4061, %336) : (!torch.vtensor<[4,1,14336],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,14336],bf16>
    %4063 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %4064 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %4065 = "torch.aten.clamp"(%4062, %4063, %4064) : (!torch.vtensor<[4,1,14336],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,14336],bf16>
    %4066 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %4067 = "torch.prims.convert_element_type"(%4065, %4066) : (!torch.vtensor<[4,1,14336],bf16>, !torch.int) -> !torch.vtensor<[4,1,14336],f8E4M3FNUZ>
    %4068 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %4069 = "torch.aten.unsqueeze"(%338, %4068) : (!torch.vtensor<[4096,14336],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,14336],f8E4M3FNUZ>
    %4070 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %4071 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %4072 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %4073 = "torch.prim.ListConstruct"(%4070, %4071, %4072) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4074 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %4075 = "torch.aten.expand"(%4069, %4073, %4074) : (!torch.vtensor<[1,4096,14336],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,14336],f8E4M3FNUZ>
    %4076 = "torch_c.to_builtin_tensor"(%4067) : (!torch.vtensor<[4,1,14336],f8E4M3FNUZ>) -> tensor<4x1x14336xf8E4M3FNUZ>
    %4077 = "torch_c.to_builtin_tensor"(%4075) : (!torch.vtensor<[4,4096,14336],f8E4M3FNUZ>) -> tensor<4x4096x14336xf8E4M3FNUZ>
    %4078 = "util.call"(%4076, %4077) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x14336xf8E4M3FNUZ_R4x4096x14336xf8E4M3FNUZ}> : (tensor<4x1x14336xf8E4M3FNUZ>, tensor<4x4096x14336xf8E4M3FNUZ>) -> tensor<4x1x4096xf32>
    %4079 = "torch_c.from_builtin_tensor"(%4078) : (tensor<4x1x4096xf32>) -> !torch.vtensor<[4,1,4096],f32>
    %4080 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %4081 = "torch.prims.convert_element_type"(%4079, %4080) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %4082 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %4083 = "torch.aten.add.Tensor"(%4000, %4081, %4082) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %4084 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %4085 = "torch.prims.convert_element_type"(%4083, %4084) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f32>
    %4086 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %4087 = "torch.aten.pow.Tensor_Scalar"(%4085, %4086) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],f32>
    %4088 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %4089 = "torch.prim.ListConstruct"(%4088) : (!torch.int) -> !torch.list<int>
    %4090 = "torch.constant.bool"() <{value = true}> : () -> !torch.bool
    %4091 = "torch.constant.none"() : () -> !torch.none
    %4092 = "torch.aten.mean.dim"(%4087, %4089, %4090, %4091) : (!torch.vtensor<[4,1,4096],f32>, !torch.list<int>, !torch.bool, !torch.none) -> !torch.vtensor<[4,1,1],f32>
    %4093 = "torch.constant.float"() <{value = 1.000000e-05 : f64}> : () -> !torch.float
    %4094 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %4095 = "torch.aten.add.Scalar"(%4092, %4093, %4094) : (!torch.vtensor<[4,1,1],f32>, !torch.float, !torch.int) -> !torch.vtensor<[4,1,1],f32>
    %4096 = "torch.aten.rsqrt"(%4095) : (!torch.vtensor<[4,1,1],f32>) -> !torch.vtensor<[4,1,1],f32>
    %4097 = "torch.aten.mul.Tensor"(%4085, %4096) : (!torch.vtensor<[4,1,4096],f32>, !torch.vtensor<[4,1,1],f32>) -> !torch.vtensor<[4,1,4096],f32>
    %4098 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %4099 = "torch.prims.convert_element_type"(%4097, %4098) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %4100 = "torch.aten.mul.Tensor"(%340, %4099) : (!torch.vtensor<[4096],bf16>, !torch.vtensor<[4,1,4096],bf16>) -> !torch.vtensor<[4,1,4096],bf16>
    %4101 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %4102 = "torch.prims.convert_element_type"(%4100, %4101) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %4103 = "torch.aten.div.Tensor"(%4102, %342) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %4104 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %4105 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %4106 = "torch.aten.clamp"(%4103, %4104, %4105) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %4107 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %4108 = "torch.prims.convert_element_type"(%4106, %4107) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %4109 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %4110 = "torch.aten.unsqueeze"(%344, %4109) : (!torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>
    %4111 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %4112 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %4113 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %4114 = "torch.prim.ListConstruct"(%4111, %4112, %4113) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4115 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %4116 = "torch.aten.expand"(%4110, %4114, %4115) : (!torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ>
    %4117 = "torch_c.to_builtin_tensor"(%4108) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %4118 = "torch_c.to_builtin_tensor"(%4116) : (!torch.vtensor<[4,4096,4096],f8E4M3FNUZ>) -> tensor<4x4096x4096xf8E4M3FNUZ>
    %4119 = "util.call"(%4117, %4118) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x1x4096xf32>
    %4120 = "torch_c.from_builtin_tensor"(%4119) : (tensor<4x1x4096xf32>) -> !torch.vtensor<[4,1,4096],f32>
    %4121 = "torch.aten.div.Tensor"(%4120, %346) : (!torch.vtensor<[4,1,4096],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],f32>
    %4122 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %4123 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %4124 = "torch.aten.clamp"(%4121, %4122, %4123) : (!torch.vtensor<[4,1,4096],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],f32>
    %4125 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %4126 = "torch.prims.convert_element_type"(%4124, %4125) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %4127 = "torch.aten.div.Tensor"(%4102, %348) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %4128 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %4129 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %4130 = "torch.aten.clamp"(%4127, %4128, %4129) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %4131 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %4132 = "torch.prims.convert_element_type"(%4130, %4131) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %4133 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %4134 = "torch.aten.unsqueeze"(%350, %4133) : (!torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>
    %4135 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %4136 = "torch.constant.int"() <{value = 1024 : i64}> : () -> !torch.int
    %4137 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %4138 = "torch.prim.ListConstruct"(%4135, %4136, %4137) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4139 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %4140 = "torch.aten.expand"(%4134, %4138, %4139) : (!torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ>
    %4141 = "torch_c.to_builtin_tensor"(%4132) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %4142 = "torch_c.to_builtin_tensor"(%4140) : (!torch.vtensor<[4,1024,4096],f8E4M3FNUZ>) -> tensor<4x1024x4096xf8E4M3FNUZ>
    %4143 = "util.call"(%4141, %4142) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x1x1024xf32>
    %4144 = "torch_c.from_builtin_tensor"(%4143) : (tensor<4x1x1024xf32>) -> !torch.vtensor<[4,1,1024],f32>
    %4145 = "torch.aten.div.Tensor"(%4144, %352) : (!torch.vtensor<[4,1,1024],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,1024],f32>
    %4146 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %4147 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %4148 = "torch.aten.clamp"(%4145, %4146, %4147) : (!torch.vtensor<[4,1,1024],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,1024],f32>
    %4149 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %4150 = "torch.prims.convert_element_type"(%4148, %4149) : (!torch.vtensor<[4,1,1024],f32>, !torch.int) -> !torch.vtensor<[4,1,1024],f8E4M3FNUZ>
    %4151 = "torch.aten.div.Tensor"(%4102, %354) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %4152 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %4153 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %4154 = "torch.aten.clamp"(%4151, %4152, %4153) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %4155 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %4156 = "torch.prims.convert_element_type"(%4154, %4155) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %4157 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %4158 = "torch.aten.unsqueeze"(%356, %4157) : (!torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>
    %4159 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %4160 = "torch.constant.int"() <{value = 1024 : i64}> : () -> !torch.int
    %4161 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %4162 = "torch.prim.ListConstruct"(%4159, %4160, %4161) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4163 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %4164 = "torch.aten.expand"(%4158, %4162, %4163) : (!torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ>
    %4165 = "torch_c.to_builtin_tensor"(%4156) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %4166 = "torch_c.to_builtin_tensor"(%4164) : (!torch.vtensor<[4,1024,4096],f8E4M3FNUZ>) -> tensor<4x1024x4096xf8E4M3FNUZ>
    %4167 = "util.call"(%4165, %4166) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x1x1024xf32>
    %4168 = "torch_c.from_builtin_tensor"(%4167) : (tensor<4x1x1024xf32>) -> !torch.vtensor<[4,1,1024],f32>
    %4169 = "torch.aten.div.Tensor"(%4168, %358) : (!torch.vtensor<[4,1,1024],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,1024],f32>
    %4170 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %4171 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %4172 = "torch.aten.clamp"(%4169, %4170, %4171) : (!torch.vtensor<[4,1,1024],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,1024],f32>
    %4173 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %4174 = "torch.prims.convert_element_type"(%4172, %4173) : (!torch.vtensor<[4,1,1024],f32>, !torch.int) -> !torch.vtensor<[4,1,1024],f8E4M3FNUZ>
    %4175 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %4176 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %4177 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %4178 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %4179 = "torch.prim.ListConstruct"(%4175, %4176, %4177, %4178) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4180 = "torch.aten.view"(%4126, %4179) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,1,32,128],f8E4M3FNUZ>
    %4181 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %4182 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %4183 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %4184 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %4185 = "torch.prim.ListConstruct"(%4181, %4182, %4183, %4184) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4186 = "torch.aten.view"(%4150, %4185) : (!torch.vtensor<[4,1,1024],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>
    %4187 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %4188 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %4189 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %4190 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %4191 = "torch.prim.ListConstruct"(%4187, %4188, %4189, %4190) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4192 = "torch.aten.view"(%4174, %4191) : (!torch.vtensor<[4,1,1024],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>
    %4193 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %4194 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %4195 = "torch.aten.transpose.int"(%4180, %4193, %4194) : (!torch.vtensor<[4,1,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %4196 = "torch.aten.mul.Tensor"(%4195, %1637) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,1,1,128],bf16>) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %4197 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %4198 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %4199 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %4200 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %4201 = "torch.aten.slice.Tensor"(%4195, %4197, %4198, %4199, %4200) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,32,1,64],f8E4M3FNUZ>
    %4202 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %4203 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %4204 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %4205 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %4206 = "torch.aten.slice.Tensor"(%4195, %4202, %4203, %4204, %4205) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,32,1,64],f8E4M3FNUZ>
    %4207 = "torch.aten.neg"(%4206) : (!torch.vtensor<[4,32,1,64],f8E4M3FNUZ>) -> !torch.vtensor<[4,32,1,64],f8E4M3FNUZ>
    %4208 = "torch.prim.ListConstruct"(%4207, %4201) : (!torch.vtensor<[4,32,1,64],f8E4M3FNUZ>, !torch.vtensor<[4,32,1,64],f8E4M3FNUZ>) -> !torch.list<vtensor>
    %4209 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %4210 = "torch.aten.cat"(%4208, %4209) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %4211 = "torch.aten.mul.Tensor"(%4210, %1651) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,1,1,128],bf16>) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %4212 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %4213 = "torch.aten.add.Tensor"(%4196, %4211, %4212) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %4214 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %4215 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %4216 = "torch.aten.transpose.int"(%4213, %4214, %4215) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,1,32,128],f8E4M3FNUZ>
    %4217 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %4218 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %4219 = "torch.aten.transpose.int"(%4186, %4217, %4218) : (!torch.vtensor<[4,1,8,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %4220 = "torch.aten.mul.Tensor"(%4219, %1637) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,1,1,128],bf16>) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %4221 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %4222 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %4223 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %4224 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %4225 = "torch.aten.slice.Tensor"(%4219, %4221, %4222, %4223, %4224) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,8,1,64],f8E4M3FNUZ>
    %4226 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %4227 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %4228 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %4229 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %4230 = "torch.aten.slice.Tensor"(%4219, %4226, %4227, %4228, %4229) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,8,1,64],f8E4M3FNUZ>
    %4231 = "torch.aten.neg"(%4230) : (!torch.vtensor<[4,8,1,64],f8E4M3FNUZ>) -> !torch.vtensor<[4,8,1,64],f8E4M3FNUZ>
    %4232 = "torch.prim.ListConstruct"(%4231, %4225) : (!torch.vtensor<[4,8,1,64],f8E4M3FNUZ>, !torch.vtensor<[4,8,1,64],f8E4M3FNUZ>) -> !torch.list<vtensor>
    %4233 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %4234 = "torch.aten.cat"(%4232, %4233) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %4235 = "torch.aten.mul.Tensor"(%4234, %1651) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,1,1,128],bf16>) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %4236 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %4237 = "torch.aten.add.Tensor"(%4220, %4235, %4236) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %4238 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %4239 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %4240 = "torch.aten.transpose.int"(%4237, %4238, %4239) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>
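    // Paged KV-cache write for the rotated keys. A hedged reading of the index
    // arithmetic below, with token positions in %arg64 and the page table in %arg65:
    //   page_id = gather(page_table, position // 32)   // 32 tokens per page
    //   slot    = position % 32
    // The cache (%3852) is viewed as [pages, 32, 2, 32, 8, 128]; the constant 5
    // filled into the second index looks like this transformer block's index, and
    // the repeated scalar %359 selects the K plane before index_put scatters %4240.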
    %4241 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %4242 = "torch.aten.floor_divide.Scalar"(%arg64, %4241) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4],si64>
    %4243 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %4244 = "torch.aten.unsqueeze"(%4242, %4243) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %4245 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %4246 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %4247 = "torch.aten.gather"(%arg65, %4245, %4244, %4246) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.vtensor<[4,1],si64>, !torch.bool) -> !torch.vtensor<[4,1],si64>
    %4248 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %4249 = "torch.aten.remainder.Scalar"(%arg64, %4248) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4],si64>
    %4250 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %4251 = "torch.aten.unsqueeze"(%4249, %4250) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %4252 = "torch.constant.none"() : () -> !torch.none
    %4253 = "torch.aten.clone"(%359, %4252) : (!torch.vtensor<[],si64>, !torch.none) -> !torch.vtensor<[],si64>
    %4254 = "torch.aten.detach"(%4253) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %4255 = "torch.aten.detach"(%4254) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %4256 = "torch.aten.detach"(%4255) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %4257 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %4258 = "torch.aten.unsqueeze"(%4256, %4257) : (!torch.vtensor<[],si64>, !torch.int) -> !torch.vtensor<[1],si64>
    %4259 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %4260 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %4261 = "torch.prim.ListConstruct"(%4259, %4260) : (!torch.int, !torch.int) -> !torch.list<int>
    %4262 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %4263 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %4264 = "torch.prim.ListConstruct"(%4262, %4263) : (!torch.int, !torch.int) -> !torch.list<int>
    %4265 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %4266 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %4267 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %4268 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %4269 = "torch.aten.empty_strided"(%4261, %4264, %4265, %4266, %4267, %4268) : (!torch.list<int>, !torch.list<int>, !torch.int, !torch.int, !torch.Device, !torch.bool) -> !torch.vtensor<[4,1],si64>
    %4270 = "torch.constant.int"() <{value = 5 : i64}> : () -> !torch.int
    %4271 = "torch.aten.fill.Scalar"(%4269, %4270) : (!torch.vtensor<[4,1],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %4272 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %4273 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %4274 = "torch.prim.ListConstruct"(%4272, %4273) : (!torch.int, !torch.int) -> !torch.list<int>
    %4275 = "torch.aten.repeat"(%4258, %4274) : (!torch.vtensor<[1],si64>, !torch.list<int>) -> !torch.vtensor<[4,1],si64>
    %4276 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %4277 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %4278 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %4279 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %4280 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %4281 = "torch.prim.ListConstruct"(%1483, %4276, %4277, %4278, %4279, %4280) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4282 = "torch.aten.view"(%3852, %4281) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%4282, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %4283 = "torch.prim.ListConstruct"(%4247, %4271, %4275, %4251) : (!torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>) -> !torch.list<optional<vtensor>>
    %4284 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %4285 = "torch.aten.index_put"(%4282, %4283, %4240, %4284) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>, !torch.bool) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%4285, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %4286 = "torch.constant.int"() <{value = 2097152 : i64}> : () -> !torch.int
    %4287 = "torch.prim.ListConstruct"(%1483, %4286) : (!torch.int, !torch.int) -> !torch.list<int>
    %4288 = "torch.aten.view"(%4285, %4287) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,2097152],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%4288, %1479) <{shape_expressions = #map2}> : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> ()
    %4289 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %4290 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %4291 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %4292 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %4293 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %4294 = "torch.prim.ListConstruct"(%1483, %4289, %4290, %4291, %4292, %4293) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4295 = "torch.aten.view"(%4288, %4294) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%4295, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %4296 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %4297 = "torch.aten.floor_divide.Scalar"(%arg64, %4296) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4],si64>
    %4298 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %4299 = "torch.aten.unsqueeze"(%4297, %4298) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %4300 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %4301 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %4302 = "torch.aten.gather"(%arg65, %4300, %4299, %4301) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.vtensor<[4,1],si64>, !torch.bool) -> !torch.vtensor<[4,1],si64>
    %4303 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %4304 = "torch.aten.remainder.Scalar"(%arg64, %4303) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4],si64>
    %4305 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %4306 = "torch.aten.unsqueeze"(%4304, %4305) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %4307 = "torch.constant.none"() : () -> !torch.none
    %4308 = "torch.aten.clone"(%360, %4307) : (!torch.vtensor<[],si64>, !torch.none) -> !torch.vtensor<[],si64>
    %4309 = "torch.aten.detach"(%4308) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %4310 = "torch.aten.detach"(%4309) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %4311 = "torch.aten.detach"(%4310) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %4312 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %4313 = "torch.aten.unsqueeze"(%4311, %4312) : (!torch.vtensor<[],si64>, !torch.int) -> !torch.vtensor<[1],si64>
    %4314 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %4315 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %4316 = "torch.prim.ListConstruct"(%4314, %4315) : (!torch.int, !torch.int) -> !torch.list<int>
    %4317 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %4318 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %4319 = "torch.prim.ListConstruct"(%4317, %4318) : (!torch.int, !torch.int) -> !torch.list<int>
    %4320 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %4321 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %4322 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %4323 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %4324 = "torch.aten.empty_strided"(%4316, %4319, %4320, %4321, %4322, %4323) : (!torch.list<int>, !torch.list<int>, !torch.int, !torch.int, !torch.Device, !torch.bool) -> !torch.vtensor<[4,1],si64>
    %4325 = "torch.constant.int"() <{value = 5 : i64}> : () -> !torch.int
    %4326 = "torch.aten.fill.Scalar"(%4324, %4325) : (!torch.vtensor<[4,1],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %4327 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %4328 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %4329 = "torch.prim.ListConstruct"(%4327, %4328) : (!torch.int, !torch.int) -> !torch.list<int>
    %4330 = "torch.aten.repeat"(%4313, %4329) : (!torch.vtensor<[1],si64>, !torch.list<int>) -> !torch.vtensor<[4,1],si64>
    %4331 = "torch.prim.ListConstruct"(%4302, %4326, %4330, %4306) : (!torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>) -> !torch.list<optional<vtensor>>
    %4332 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %4333 = "torch.aten.index_put"(%4295, %4331, %4192, %4332) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>, !torch.bool) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%4333, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %4334 = "torch.constant.int"() <{value = 2097152 : i64}> : () -> !torch.int
    %4335 = "torch.prim.ListConstruct"(%1483, %4334) : (!torch.int, !torch.int) -> !torch.list<int>
    %4336 = "torch.aten.view"(%4333, %4335) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,2097152],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%4336, %1479) <{shape_expressions = #map2}> : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> ()
    %4337 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %4338 = "torch.aten.mul.Scalar"(%arg65, %4337) : (!torch.vtensor<[4,?],si64>, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%4338, %1478) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %4339 = "torch.constant.int"() <{value = 10 : i64}> : () -> !torch.int
    %4340 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %4341 = "torch.aten.add.Scalar"(%4338, %4339, %4340) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%4341, %1478) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %4342 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %4343 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %4344 = "torch.aten.add.Scalar"(%4341, %4342, %4343) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%4344, %1478) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %4345 = "torch.prim.ListConstruct"(%1924) : (!torch.int) -> !torch.list<int>
    %4346 = "torch.aten.view"(%4344, %4345) : (!torch.vtensor<[4,?],si64>, !torch.list<int>) -> !torch.vtensor<[?],si64>
    "torch.bind_symbolic_shape"(%4346, %1478) <{shape_expressions = #map24}> : (!torch.vtensor<[?],si64>, !torch.int) -> ()
    %4347 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %4348 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %4349 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %4350 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %4351 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %4352 = "torch.prim.ListConstruct"(%1483, %4347, %4348, %4349, %4350, %4351) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4353 = "torch.aten.view"(%4336, %4352) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%4353, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %4354 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %4355 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %4356 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %4357 = "torch.prim.ListConstruct"(%1914, %4354, %4355, %4356) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4358 = "torch.aten.view"(%4353, %4357) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%4358, %1479) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %4359 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %4360 = "torch.aten.index_select"(%4358, %4359, %4346) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int, !torch.vtensor<[?],si64>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%4360, %1478) <{shape_expressions = #map23}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %4361 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %4362 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %4363 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %4364 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %4365 = "torch.prim.ListConstruct"(%4361, %1481, %4362, %4363, %4364) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4366 = "torch.aten.view"(%4360, %4365) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%4366, %1478) <{shape_expressions = #map22}> : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %4367 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %4368 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %4369 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %4370 = "torch.prim.ListConstruct"(%4367, %1485, %4368, %4369) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4371 = "torch.aten.view"(%4366, %4370) : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%4371, %1478) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %4372 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %4373 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %4374 = "torch.aten.add.Scalar"(%4341, %4372, %4373) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%4374, %1478) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %4375 = "torch.prim.ListConstruct"(%1924) : (!torch.int) -> !torch.list<int>
    %4376 = "torch.aten.view"(%4374, %4375) : (!torch.vtensor<[4,?],si64>, !torch.list<int>) -> !torch.vtensor<[?],si64>
    "torch.bind_symbolic_shape"(%4376, %1478) <{shape_expressions = #map24}> : (!torch.vtensor<[?],si64>, !torch.int) -> ()
    %4377 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %4378 = "torch.aten.index_select"(%4358, %4377, %4376) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int, !torch.vtensor<[?],si64>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%4378, %1478) <{shape_expressions = #map23}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %4379 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %4380 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %4381 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %4382 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %4383 = "torch.prim.ListConstruct"(%4379, %1481, %4380, %4381, %4382) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4384 = "torch.aten.view"(%4378, %4383) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%4384, %1478) <{shape_expressions = #map22}> : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %4385 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %4386 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %4387 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %4388 = "torch.prim.ListConstruct"(%4385, %1485, %4386, %4387) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4389 = "torch.aten.view"(%4384, %4388) : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%4389, %1478) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %4390 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %4391 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %4392 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %4393 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %4394 = "torch.aten.slice.Tensor"(%4371, %4390, %4391, %4392, %4393) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%4394, %1478) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %4395 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %4396 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %4397 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %4398 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %4399 = "torch.aten.slice.Tensor"(%4389, %4395, %4396, %4397, %4398) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%4399, %1478) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %4400 = "torch.constant.int"() <{value = -2 : i64}> : () -> !torch.int
    %4401 = "torch.aten.unsqueeze"(%4394, %4400) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%4401, %1478) <{shape_expressions = #map25}> : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.int) -> ()
    %4402 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %4403 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %4404 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %4405 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %4406 = "torch.prim.ListConstruct"(%4402, %1485, %4403, %4404, %4405) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4407 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %4408 = "torch.aten.expand"(%4401, %4406, %4407) : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%4408, %1478) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %4409 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %4410 = "torch.aten.clone"(%4408, %4409) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%4410, %1478) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %4411 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %4412 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %4413 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %4414 = "torch.prim.ListConstruct"(%4411, %1485, %4412, %4413) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4415 = "torch.aten._unsafe_view"(%4410, %4414) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%4415, %1478) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %4416 = "torch.constant.int"() <{value = -2 : i64}> : () -> !torch.int
    %4417 = "torch.aten.unsqueeze"(%4399, %4416) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%4417, %1478) <{shape_expressions = #map25}> : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.int) -> ()
    %4418 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %4419 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %4420 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %4421 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %4422 = "torch.prim.ListConstruct"(%4418, %1485, %4419, %4420, %4421) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4423 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %4424 = "torch.aten.expand"(%4417, %4422, %4423) : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%4424, %1478) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %4425 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %4426 = "torch.aten.clone"(%4424, %4425) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%4426, %1478) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %4427 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %4428 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %4429 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %4430 = "torch.prim.ListConstruct"(%4427, %1485, %4428, %4429) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4431 = "torch.aten._unsafe_view"(%4426, %4430) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%4431, %1478) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %4432 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %4433 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %4434 = "torch.aten.transpose.int"(%4216, %4432, %4433) : (!torch.vtensor<[4,1,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %4435 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %4436 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %4437 = "torch.aten.transpose.int"(%4415, %4435, %4436) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%4437, %1478) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %4438 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %4439 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %4440 = "torch.aten.transpose.int"(%4431, %4438, %4439) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%4440, %1478) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %4441 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %4442 = "torch.aten.squeeze.dim"(%1516, %4441) : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,1,1,?],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%4442, %1478) <{shape_expressions = #map7}> : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>, !torch.int) -> ()
    %4443 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %4444 = "torch.aten.squeeze.dim"(%4442, %4443) : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,1,1,?],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%4444, %1478) <{shape_expressions = #map7}> : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>, !torch.int) -> ()
    %4445 = "torch_c.to_builtin_tensor"(%4434) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>) -> tensor<4x32x1x128xf8E4M3FNUZ>
    %4446 = "tensor.cast"(%4445) : (tensor<4x32x1x128xf8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %4447 = "torch_c.to_builtin_tensor"(%4437) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %4448 = "torch_c.to_builtin_tensor"(%4440) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %4449 = "torch_c.to_builtin_tensor"(%4444) : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>) -> tensor<4x1x1x?xf8E4M3FNUZ>
    %4450 = "tensor.cast"(%4449) : (tensor<4x1x1x?xf8E4M3FNUZ>) -> tensor<?x?x?x?xf8E4M3FNUZ>
    %4451 = "torch_c.to_builtin_tensor"(%362) : (!torch.vtensor<[],f32>) -> tensor<f32>
    %4452 = "util.call"(%4446, %4447, %4448, %4451, %4450) <{callee = @sharktank_masked_flash_attention_4_32_128_128_f8E4M3FNUZ_f32_f32}> : (tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<f32>, tensor<?x?x?x?xf8E4M3FNUZ>) -> tensor<4x32x?x128xf32>
    %4453 = "tensor.cast"(%4452) : (tensor<4x32x?x128xf32>) -> tensor<4x32x1x128xf32>
    %4454 = "torch_c.from_builtin_tensor"(%4453) : (tensor<4x32x1x128xf32>) -> !torch.vtensor<[4,32,1,128],f32>
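    // Attention output path: transpose to [4, 1, 32, 128], flatten heads to
    // [4, 1, 4096], then the recurring f8 quantization idiom -- divide by a
    // per-tensor scale, clamp to [-240, 240] (the finite range of f8E4M3FNUZ),
    // convert -- before the attn_output projection against the expanded
    // [4, 4096, 4096] weight.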
    %4455 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %4456 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %4457 = "torch.aten.transpose.int"(%4454, %4455, %4456) : (!torch.vtensor<[4,32,1,128],f32>, !torch.int, !torch.int) -> !torch.vtensor<[4,1,32,128],f32>
    %4458 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %4459 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %4460 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %4461 = "torch.prim.ListConstruct"(%4458, %4459, %4460) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4462 = "torch.aten.view"(%4457, %4461) : (!torch.vtensor<[4,1,32,128],f32>, !torch.list<int>) -> !torch.vtensor<[4,1,4096],f32>
    %4463 = "torch.aten.div.Tensor"(%4462, %364) : (!torch.vtensor<[4,1,4096],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],f32>
    %4464 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %4465 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %4466 = "torch.aten.clamp"(%4463, %4464, %4465) : (!torch.vtensor<[4,1,4096],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],f32>
    %4467 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %4468 = "torch.prims.convert_element_type"(%4466, %4467) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %4469 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %4470 = "torch.aten.unsqueeze"(%366, %4469) : (!torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>
    %4471 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %4472 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %4473 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %4474 = "torch.prim.ListConstruct"(%4471, %4472, %4473) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4475 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %4476 = "torch.aten.expand"(%4470, %4474, %4475) : (!torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ>
    %4477 = "torch_c.to_builtin_tensor"(%4468) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %4478 = "torch_c.to_builtin_tensor"(%4476) : (!torch.vtensor<[4,4096,4096],f8E4M3FNUZ>) -> tensor<4x4096x4096xf8E4M3FNUZ>
    %4479 = "util.call"(%4477, %4478) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x1x4096xf32>
    %4480 = "torch_c.from_builtin_tensor"(%4479) : (tensor<4x1x4096xf32>) -> !torch.vtensor<[4,1,4096],f32>
    %4481 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %4482 = "torch.prims.convert_element_type"(%4480, %4481) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
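    // First residual add (block input %4083), then RMSNorm computed in f32:
    //   y = x * rsqrt(mean(x^2, dim=-1) + 1e-5)
    // scaled by the norm weight %368 and cast back to bf16 for the feed-forward.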
    %4483 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %4484 = "torch.aten.add.Tensor"(%4083, %4482, %4483) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %4485 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %4486 = "torch.prims.convert_element_type"(%4484, %4485) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f32>
    %4487 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %4488 = "torch.aten.pow.Tensor_Scalar"(%4486, %4487) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],f32>
    %4489 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %4490 = "torch.prim.ListConstruct"(%4489) : (!torch.int) -> !torch.list<int>
    %4491 = "torch.constant.bool"() <{value = true}> : () -> !torch.bool
    %4492 = "torch.constant.none"() : () -> !torch.none
    %4493 = "torch.aten.mean.dim"(%4488, %4490, %4491, %4492) : (!torch.vtensor<[4,1,4096],f32>, !torch.list<int>, !torch.bool, !torch.none) -> !torch.vtensor<[4,1,1],f32>
    %4494 = "torch.constant.float"() <{value = 1.000000e-05 : f64}> : () -> !torch.float
    %4495 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %4496 = "torch.aten.add.Scalar"(%4493, %4494, %4495) : (!torch.vtensor<[4,1,1],f32>, !torch.float, !torch.int) -> !torch.vtensor<[4,1,1],f32>
    %4497 = "torch.aten.rsqrt"(%4496) : (!torch.vtensor<[4,1,1],f32>) -> !torch.vtensor<[4,1,1],f32>
    %4498 = "torch.aten.mul.Tensor"(%4486, %4497) : (!torch.vtensor<[4,1,4096],f32>, !torch.vtensor<[4,1,1],f32>) -> !torch.vtensor<[4,1,4096],f32>
    %4499 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %4500 = "torch.prims.convert_element_type"(%4498, %4499) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %4501 = "torch.aten.mul.Tensor"(%368, %4500) : (!torch.vtensor<[4096],bf16>, !torch.vtensor<[4,1,4096],bf16>) -> !torch.vtensor<[4,1,4096],bf16>
    %4502 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %4503 = "torch.prims.convert_element_type"(%4501, %4502) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
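    // SwiGLU feed-forward, gate branch: quantize the normed activations to f8,
    // contract with the [4, 14336, 4096] gate weight, then SiLU in bf16.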
    %4504 = "torch.aten.div.Tensor"(%4503, %370) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %4505 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %4506 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %4507 = "torch.aten.clamp"(%4504, %4505, %4506) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %4508 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %4509 = "torch.prims.convert_element_type"(%4507, %4508) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %4510 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %4511 = "torch.aten.unsqueeze"(%372, %4510) : (!torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>
    %4512 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %4513 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %4514 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %4515 = "torch.prim.ListConstruct"(%4512, %4513, %4514) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4516 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %4517 = "torch.aten.expand"(%4511, %4515, %4516) : (!torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ>
    %4518 = "torch_c.to_builtin_tensor"(%4509) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %4519 = "torch_c.to_builtin_tensor"(%4517) : (!torch.vtensor<[4,14336,4096],f8E4M3FNUZ>) -> tensor<4x14336x4096xf8E4M3FNUZ>
    %4520 = "util.call"(%4518, %4519) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x1x14336xf32>
    %4521 = "torch_c.from_builtin_tensor"(%4520) : (tensor<4x1x14336xf32>) -> !torch.vtensor<[4,1,14336],f32>
    %4522 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %4523 = "torch.prims.convert_element_type"(%4521, %4522) : (!torch.vtensor<[4,1,14336],f32>, !torch.int) -> !torch.vtensor<[4,1,14336],bf16>
    %4524 = "torch.aten.silu"(%4523) : (!torch.vtensor<[4,1,14336],bf16>) -> !torch.vtensor<[4,1,14336],bf16>
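    // Up branch on the same normed input (%4503), followed by the elementwise
    // product silu(gate(x)) * up(x).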
    %4525 = "torch.aten.div.Tensor"(%4503, %374) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %4526 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %4527 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %4528 = "torch.aten.clamp"(%4525, %4526, %4527) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %4529 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %4530 = "torch.prims.convert_element_type"(%4528, %4529) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %4531 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %4532 = "torch.aten.unsqueeze"(%376, %4531) : (!torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>
    %4533 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %4534 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %4535 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %4536 = "torch.prim.ListConstruct"(%4533, %4534, %4535) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4537 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %4538 = "torch.aten.expand"(%4532, %4536, %4537) : (!torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ>
    %4539 = "torch_c.to_builtin_tensor"(%4530) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %4540 = "torch_c.to_builtin_tensor"(%4538) : (!torch.vtensor<[4,14336,4096],f8E4M3FNUZ>) -> tensor<4x14336x4096xf8E4M3FNUZ>
    %4541 = "util.call"(%4539, %4540) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x1x14336xf32>
    %4542 = "torch_c.from_builtin_tensor"(%4541) : (tensor<4x1x14336xf32>) -> !torch.vtensor<[4,1,14336],f32>
    %4543 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %4544 = "torch.prims.convert_element_type"(%4542, %4543) : (!torch.vtensor<[4,1,14336],f32>, !torch.int) -> !torch.vtensor<[4,1,14336],bf16>
    %4545 = "torch.aten.mul.Tensor"(%4524, %4544) : (!torch.vtensor<[4,1,14336],bf16>, !torch.vtensor<[4,1,14336],bf16>) -> !torch.vtensor<[4,1,14336],bf16>
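    // Down projection: re-quantize the [4, 1, 14336] product and contract with the
    // [4, 4096, 14336] weight back to the 4096-wide residual stream.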
    %4546 = "torch.aten.div.Tensor"(%4545, %378) : (!torch.vtensor<[4,1,14336],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,14336],bf16>
    %4547 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %4548 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %4549 = "torch.aten.clamp"(%4546, %4547, %4548) : (!torch.vtensor<[4,1,14336],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,14336],bf16>
    %4550 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %4551 = "torch.prims.convert_element_type"(%4549, %4550) : (!torch.vtensor<[4,1,14336],bf16>, !torch.int) -> !torch.vtensor<[4,1,14336],f8E4M3FNUZ>
    %4552 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %4553 = "torch.aten.unsqueeze"(%380, %4552) : (!torch.vtensor<[4096,14336],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,14336],f8E4M3FNUZ>
    %4554 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %4555 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %4556 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %4557 = "torch.prim.ListConstruct"(%4554, %4555, %4556) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4558 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %4559 = "torch.aten.expand"(%4553, %4557, %4558) : (!torch.vtensor<[1,4096,14336],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,14336],f8E4M3FNUZ>
    %4560 = "torch_c.to_builtin_tensor"(%4551) : (!torch.vtensor<[4,1,14336],f8E4M3FNUZ>) -> tensor<4x1x14336xf8E4M3FNUZ>
    %4561 = "torch_c.to_builtin_tensor"(%4559) : (!torch.vtensor<[4,4096,14336],f8E4M3FNUZ>) -> tensor<4x4096x14336xf8E4M3FNUZ>
    %4562 = "util.call"(%4560, %4561) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x14336xf8E4M3FNUZ_R4x4096x14336xf8E4M3FNUZ}> : (tensor<4x1x14336xf8E4M3FNUZ>, tensor<4x4096x14336xf8E4M3FNUZ>) -> tensor<4x1x4096xf32>
    %4563 = "torch_c.from_builtin_tensor"(%4562) : (tensor<4x1x4096xf32>) -> !torch.vtensor<[4,1,4096],f32>
    %4564 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %4565 = "torch.prims.convert_element_type"(%4563, %4564) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
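    // Second residual add closes this transformer block; the RMSNorm that follows
    // (weight %382) appears to be the next block's attention norm.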
    %4566 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %4567 = "torch.aten.add.Tensor"(%4484, %4565, %4566) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %4568 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %4569 = "torch.prims.convert_element_type"(%4567, %4568) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f32>
    %4570 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %4571 = "torch.aten.pow.Tensor_Scalar"(%4569, %4570) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],f32>
    %4572 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %4573 = "torch.prim.ListConstruct"(%4572) : (!torch.int) -> !torch.list<int>
    %4574 = "torch.constant.bool"() <{value = true}> : () -> !torch.bool
    %4575 = "torch.constant.none"() : () -> !torch.none
    %4576 = "torch.aten.mean.dim"(%4571, %4573, %4574, %4575) : (!torch.vtensor<[4,1,4096],f32>, !torch.list<int>, !torch.bool, !torch.none) -> !torch.vtensor<[4,1,1],f32>
    %4577 = "torch.constant.float"() <{value = 1.000000e-05 : f64}> : () -> !torch.float
    %4578 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %4579 = "torch.aten.add.Scalar"(%4576, %4577, %4578) : (!torch.vtensor<[4,1,1],f32>, !torch.float, !torch.int) -> !torch.vtensor<[4,1,1],f32>
    %4580 = "torch.aten.rsqrt"(%4579) : (!torch.vtensor<[4,1,1],f32>) -> !torch.vtensor<[4,1,1],f32>
    %4581 = "torch.aten.mul.Tensor"(%4569, %4580) : (!torch.vtensor<[4,1,4096],f32>, !torch.vtensor<[4,1,1],f32>) -> !torch.vtensor<[4,1,4096],f32>
    %4582 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %4583 = "torch.prims.convert_element_type"(%4581, %4582) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %4584 = "torch.aten.mul.Tensor"(%382, %4583) : (!torch.vtensor<[4096],bf16>, !torch.vtensor<[4,1,4096],bf16>) -> !torch.vtensor<[4,1,4096],bf16>
    %4585 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %4586 = "torch.prims.convert_element_type"(%4584, %4585) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
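    // Next block's attention projections from the shared normed input (%4586):
    // Q -> [4, 1, 4096], K -> [4, 1, 1024], V -> [4, 1, 1024], each with its own
    // scale / clamp to [-240, 240] / f8 conversion before the batched matmul.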
    %4587 = "torch.aten.div.Tensor"(%4586, %384) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %4588 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %4589 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %4590 = "torch.aten.clamp"(%4587, %4588, %4589) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %4591 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %4592 = "torch.prims.convert_element_type"(%4590, %4591) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %4593 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %4594 = "torch.aten.unsqueeze"(%386, %4593) : (!torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>
    %4595 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %4596 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %4597 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %4598 = "torch.prim.ListConstruct"(%4595, %4596, %4597) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4599 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %4600 = "torch.aten.expand"(%4594, %4598, %4599) : (!torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ>
    %4601 = "torch_c.to_builtin_tensor"(%4592) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %4602 = "torch_c.to_builtin_tensor"(%4600) : (!torch.vtensor<[4,4096,4096],f8E4M3FNUZ>) -> tensor<4x4096x4096xf8E4M3FNUZ>
    %4603 = "util.call"(%4601, %4602) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x1x4096xf32>
    %4604 = "torch_c.from_builtin_tensor"(%4603) : (tensor<4x1x4096xf32>) -> !torch.vtensor<[4,1,4096],f32>
    %4605 = "torch.aten.div.Tensor"(%4604, %388) : (!torch.vtensor<[4,1,4096],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],f32>
    %4606 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %4607 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %4608 = "torch.aten.clamp"(%4605, %4606, %4607) : (!torch.vtensor<[4,1,4096],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],f32>
    %4609 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %4610 = "torch.prims.convert_element_type"(%4608, %4609) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %4611 = "torch.aten.div.Tensor"(%4586, %390) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %4612 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %4613 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %4614 = "torch.aten.clamp"(%4611, %4612, %4613) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %4615 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %4616 = "torch.prims.convert_element_type"(%4614, %4615) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %4617 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %4618 = "torch.aten.unsqueeze"(%392, %4617) : (!torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>
    %4619 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %4620 = "torch.constant.int"() <{value = 1024 : i64}> : () -> !torch.int
    %4621 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %4622 = "torch.prim.ListConstruct"(%4619, %4620, %4621) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4623 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %4624 = "torch.aten.expand"(%4618, %4622, %4623) : (!torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ>
    %4625 = "torch_c.to_builtin_tensor"(%4616) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %4626 = "torch_c.to_builtin_tensor"(%4624) : (!torch.vtensor<[4,1024,4096],f8E4M3FNUZ>) -> tensor<4x1024x4096xf8E4M3FNUZ>
    %4627 = "util.call"(%4625, %4626) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x1x1024xf32>
    %4628 = "torch_c.from_builtin_tensor"(%4627) : (tensor<4x1x1024xf32>) -> !torch.vtensor<[4,1,1024],f32>
    %4629 = "torch.aten.div.Tensor"(%4628, %394) : (!torch.vtensor<[4,1,1024],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,1024],f32>
    %4630 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %4631 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %4632 = "torch.aten.clamp"(%4629, %4630, %4631) : (!torch.vtensor<[4,1,1024],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,1024],f32>
    %4633 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %4634 = "torch.prims.convert_element_type"(%4632, %4633) : (!torch.vtensor<[4,1,1024],f32>, !torch.int) -> !torch.vtensor<[4,1,1024],f8E4M3FNUZ>
    %4635 = "torch.aten.div.Tensor"(%4586, %396) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %4636 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %4637 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %4638 = "torch.aten.clamp"(%4635, %4636, %4637) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %4639 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %4640 = "torch.prims.convert_element_type"(%4638, %4639) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %4641 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %4642 = "torch.aten.unsqueeze"(%398, %4641) : (!torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>
    %4643 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %4644 = "torch.constant.int"() <{value = 1024 : i64}> : () -> !torch.int
    %4645 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %4646 = "torch.prim.ListConstruct"(%4643, %4644, %4645) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4647 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %4648 = "torch.aten.expand"(%4642, %4646, %4647) : (!torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ>
    %4649 = "torch_c.to_builtin_tensor"(%4640) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %4650 = "torch_c.to_builtin_tensor"(%4648) : (!torch.vtensor<[4,1024,4096],f8E4M3FNUZ>) -> tensor<4x1024x4096xf8E4M3FNUZ>
    %4651 = "util.call"(%4649, %4650) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x1x1024xf32>
    %4652 = "torch_c.from_builtin_tensor"(%4651) : (tensor<4x1x1024xf32>) -> !torch.vtensor<[4,1,1024],f32>
    %4653 = "torch.aten.div.Tensor"(%4652, %400) : (!torch.vtensor<[4,1,1024],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,1024],f32>
    %4654 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %4655 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %4656 = "torch.aten.clamp"(%4653, %4654, %4655) : (!torch.vtensor<[4,1,1024],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,1024],f32>
    %4657 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %4658 = "torch.prims.convert_element_type"(%4656, %4657) : (!torch.vtensor<[4,1,1024],f32>, !torch.int) -> !torch.vtensor<[4,1,1024],f8E4M3FNUZ>
    %4659 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %4660 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %4661 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %4662 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %4663 = "torch.prim.ListConstruct"(%4659, %4660, %4661, %4662) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4664 = "torch.aten.view"(%4610, %4663) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,1,32,128],f8E4M3FNUZ>
    %4665 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %4666 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %4667 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %4668 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %4669 = "torch.prim.ListConstruct"(%4665, %4666, %4667, %4668) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4670 = "torch.aten.view"(%4634, %4669) : (!torch.vtensor<[4,1,1024],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>
    %4671 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %4672 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %4673 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %4674 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %4675 = "torch.prim.ListConstruct"(%4671, %4672, %4673, %4674) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4676 = "torch.aten.view"(%4658, %4675) : (!torch.vtensor<[4,1,1024],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>
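    // Head split: Q to [4, 1, 32, 128], K and V to [4, 1, 8, 128] (32 query heads,
    // 8 KV heads, head_dim 128), then the same RoPE rotate-half pattern as above.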
    %4677 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %4678 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %4679 = "torch.aten.transpose.int"(%4664, %4677, %4678) : (!torch.vtensor<[4,1,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %4680 = "torch.aten.mul.Tensor"(%4679, %1637) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,1,1,128],bf16>) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %4681 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %4682 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %4683 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %4684 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %4685 = "torch.aten.slice.Tensor"(%4679, %4681, %4682, %4683, %4684) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,32,1,64],f8E4M3FNUZ>
    %4686 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %4687 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %4688 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %4689 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %4690 = "torch.aten.slice.Tensor"(%4679, %4686, %4687, %4688, %4689) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,32,1,64],f8E4M3FNUZ>
    %4691 = "torch.aten.neg"(%4690) : (!torch.vtensor<[4,32,1,64],f8E4M3FNUZ>) -> !torch.vtensor<[4,32,1,64],f8E4M3FNUZ>
    %4692 = "torch.prim.ListConstruct"(%4691, %4685) : (!torch.vtensor<[4,32,1,64],f8E4M3FNUZ>, !torch.vtensor<[4,32,1,64],f8E4M3FNUZ>) -> !torch.list<vtensor>
    %4693 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %4694 = "torch.aten.cat"(%4692, %4693) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %4695 = "torch.aten.mul.Tensor"(%4694, %1651) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,1,1,128],bf16>) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %4696 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %4697 = "torch.aten.add.Tensor"(%4680, %4695, %4696) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %4698 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %4699 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %4700 = "torch.aten.transpose.int"(%4697, %4698, %4699) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,1,32,128],f8E4M3FNUZ>
    %4701 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %4702 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %4703 = "torch.aten.transpose.int"(%4670, %4701, %4702) : (!torch.vtensor<[4,1,8,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %4704 = "torch.aten.mul.Tensor"(%4703, %1637) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,1,1,128],bf16>) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %4705 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %4706 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %4707 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %4708 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %4709 = "torch.aten.slice.Tensor"(%4703, %4705, %4706, %4707, %4708) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,8,1,64],f8E4M3FNUZ>
    %4710 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %4711 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %4712 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %4713 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %4714 = "torch.aten.slice.Tensor"(%4703, %4710, %4711, %4712, %4713) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,8,1,64],f8E4M3FNUZ>
    %4715 = "torch.aten.neg"(%4714) : (!torch.vtensor<[4,8,1,64],f8E4M3FNUZ>) -> !torch.vtensor<[4,8,1,64],f8E4M3FNUZ>
    %4716 = "torch.prim.ListConstruct"(%4715, %4709) : (!torch.vtensor<[4,8,1,64],f8E4M3FNUZ>, !torch.vtensor<[4,8,1,64],f8E4M3FNUZ>) -> !torch.list<vtensor>
    %4717 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %4718 = "torch.aten.cat"(%4716, %4717) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %4719 = "torch.aten.mul.Tensor"(%4718, %1651) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,1,1,128],bf16>) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %4720 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %4721 = "torch.aten.add.Tensor"(%4704, %4719, %4720) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %4722 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %4723 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %4724 = "torch.aten.transpose.int"(%4721, %4722, %4723) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>
    %4725 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %4726 = "torch.aten.floor_divide.Scalar"(%arg64, %4725) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4],si64>
    %4727 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %4728 = "torch.aten.unsqueeze"(%4726, %4727) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %4729 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %4730 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %4731 = "torch.aten.gather"(%arg65, %4729, %4728, %4730) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.vtensor<[4,1],si64>, !torch.bool) -> !torch.vtensor<[4,1],si64>
    %4732 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %4733 = "torch.aten.remainder.Scalar"(%arg64, %4732) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4],si64>
    %4734 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %4735 = "torch.aten.unsqueeze"(%4733, %4734) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %4736 = "torch.constant.none"() : () -> !torch.none
    %4737 = "torch.aten.clone"(%401, %4736) : (!torch.vtensor<[],si64>, !torch.none) -> !torch.vtensor<[],si64>
    %4738 = "torch.aten.detach"(%4737) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %4739 = "torch.aten.detach"(%4738) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %4740 = "torch.aten.detach"(%4739) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %4741 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %4742 = "torch.aten.unsqueeze"(%4740, %4741) : (!torch.vtensor<[],si64>, !torch.int) -> !torch.vtensor<[1],si64>
    %4743 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %4744 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %4745 = "torch.prim.ListConstruct"(%4743, %4744) : (!torch.int, !torch.int) -> !torch.list<int>
    %4746 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %4747 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %4748 = "torch.prim.ListConstruct"(%4746, %4747) : (!torch.int, !torch.int) -> !torch.list<int>
    %4749 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %4750 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %4751 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %4752 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %4753 = "torch.aten.empty_strided"(%4745, %4748, %4749, %4750, %4751, %4752) : (!torch.list<int>, !torch.list<int>, !torch.int, !torch.int, !torch.Device, !torch.bool) -> !torch.vtensor<[4,1],si64>
    %4754 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %4755 = "torch.aten.fill.Scalar"(%4753, %4754) : (!torch.vtensor<[4,1],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %4756 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %4757 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %4758 = "torch.prim.ListConstruct"(%4756, %4757) : (!torch.int, !torch.int) -> !torch.list<int>
    %4759 = "torch.aten.repeat"(%4742, %4758) : (!torch.vtensor<[1],si64>, !torch.list<int>) -> !torch.vtensor<[4,1],si64>
    %4760 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %4761 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %4762 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %4763 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %4764 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %4765 = "torch.prim.ListConstruct"(%1483, %4760, %4761, %4762, %4763, %4764) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4766 = "torch.aten.view"(%4336, %4765) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%4766, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %4767 = "torch.prim.ListConstruct"(%4731, %4755, %4759, %4735) : (!torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>) -> !torch.list<optional<vtensor>>
    %4768 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %4769 = "torch.aten.index_put"(%4766, %4767, %4724, %4768) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>, !torch.bool) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%4769, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %4770 = "torch.constant.int"() <{value = 2097152 : i64}> : () -> !torch.int
    %4771 = "torch.prim.ListConstruct"(%1483, %4770) : (!torch.int, !torch.int) -> !torch.list<int>
    %4772 = "torch.aten.view"(%4769, %4771) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,2097152],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%4772, %1479) <{shape_expressions = #map2}> : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> ()
    %4773 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %4774 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %4775 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %4776 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %4777 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %4778 = "torch.prim.ListConstruct"(%1483, %4773, %4774, %4775, %4776, %4777) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4779 = "torch.aten.view"(%4772, %4778) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%4779, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %4780 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %4781 = "torch.aten.floor_divide.Scalar"(%arg64, %4780) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4],si64>
    %4782 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %4783 = "torch.aten.unsqueeze"(%4781, %4782) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %4784 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %4785 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %4786 = "torch.aten.gather"(%arg65, %4784, %4783, %4785) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.vtensor<[4,1],si64>, !torch.bool) -> !torch.vtensor<[4,1],si64>
    %4787 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %4788 = "torch.aten.remainder.Scalar"(%arg64, %4787) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4],si64>
    %4789 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %4790 = "torch.aten.unsqueeze"(%4788, %4789) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %4791 = "torch.constant.none"() : () -> !torch.none
    %4792 = "torch.aten.clone"(%402, %4791) : (!torch.vtensor<[],si64>, !torch.none) -> !torch.vtensor<[],si64>
    %4793 = "torch.aten.detach"(%4792) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %4794 = "torch.aten.detach"(%4793) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %4795 = "torch.aten.detach"(%4794) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %4796 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %4797 = "torch.aten.unsqueeze"(%4795, %4796) : (!torch.vtensor<[],si64>, !torch.int) -> !torch.vtensor<[1],si64>
    %4798 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %4799 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %4800 = "torch.prim.ListConstruct"(%4798, %4799) : (!torch.int, !torch.int) -> !torch.list<int>
    %4801 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %4802 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %4803 = "torch.prim.ListConstruct"(%4801, %4802) : (!torch.int, !torch.int) -> !torch.list<int>
    %4804 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %4805 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %4806 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %4807 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %4808 = "torch.aten.empty_strided"(%4800, %4803, %4804, %4805, %4806, %4807) : (!torch.list<int>, !torch.list<int>, !torch.int, !torch.int, !torch.Device, !torch.bool) -> !torch.vtensor<[4,1],si64>
    %4809 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %4810 = "torch.aten.fill.Scalar"(%4808, %4809) : (!torch.vtensor<[4,1],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %4811 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %4812 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %4813 = "torch.prim.ListConstruct"(%4811, %4812) : (!torch.int, !torch.int) -> !torch.list<int>
    %4814 = "torch.aten.repeat"(%4797, %4813) : (!torch.vtensor<[1],si64>, !torch.list<int>) -> !torch.vtensor<[4,1],si64>
    %4815 = "torch.prim.ListConstruct"(%4786, %4810, %4814, %4790) : (!torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>) -> !torch.list<optional<vtensor>>
    %4816 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %4817 = "torch.aten.index_put"(%4779, %4815, %4676, %4816) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>, !torch.bool) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%4817, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %4818 = "torch.constant.int"() <{value = 2097152 : i64}> : () -> !torch.int
    %4819 = "torch.prim.ListConstruct"(%1483, %4818) : (!torch.int, !torch.int) -> !torch.list<int>
    %4820 = "torch.aten.view"(%4817, %4819) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,2097152],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%4820, %1479) <{shape_expressions = #map2}> : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> ()
    %4821 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %4822 = "torch.aten.mul.Scalar"(%arg65, %4821) : (!torch.vtensor<[4,?],si64>, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%4822, %1478) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %4823 = "torch.constant.int"() <{value = 12 : i64}> : () -> !torch.int
    %4824 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %4825 = "torch.aten.add.Scalar"(%4822, %4823, %4824) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%4825, %1478) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %4826 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %4827 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %4828 = "torch.aten.add.Scalar"(%4825, %4826, %4827) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%4828, %1478) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %4829 = "torch.prim.ListConstruct"(%1924) : (!torch.int) -> !torch.list<int>
    %4830 = "torch.aten.view"(%4828, %4829) : (!torch.vtensor<[4,?],si64>, !torch.list<int>) -> !torch.vtensor<[?],si64>
    "torch.bind_symbolic_shape"(%4830, %1478) <{shape_expressions = #map24}> : (!torch.vtensor<[?],si64>, !torch.int) -> ()
    %4831 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %4832 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %4833 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %4834 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %4835 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %4836 = "torch.prim.ListConstruct"(%1483, %4831, %4832, %4833, %4834, %4835) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4837 = "torch.aten.view"(%4820, %4836) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%4837, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %4838 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %4839 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %4840 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %4841 = "torch.prim.ListConstruct"(%1914, %4838, %4839, %4840) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4842 = "torch.aten.view"(%4837, %4841) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%4842, %1479) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %4843 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %4844 = "torch.aten.index_select"(%4842, %4843, %4830) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int, !torch.vtensor<[?],si64>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%4844, %1478) <{shape_expressions = #map23}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %4845 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %4846 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %4847 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %4848 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %4849 = "torch.prim.ListConstruct"(%4845, %1481, %4846, %4847, %4848) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4850 = "torch.aten.view"(%4844, %4849) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%4850, %1478) <{shape_expressions = #map22}> : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %4851 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %4852 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %4853 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %4854 = "torch.prim.ListConstruct"(%4851, %1485, %4852, %4853) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4855 = "torch.aten.view"(%4850, %4854) : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%4855, %1478) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %4856 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %4857 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %4858 = "torch.aten.add.Scalar"(%4825, %4856, %4857) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%4858, %1478) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %4859 = "torch.prim.ListConstruct"(%1924) : (!torch.int) -> !torch.list<int>
    %4860 = "torch.aten.view"(%4858, %4859) : (!torch.vtensor<[4,?],si64>, !torch.list<int>) -> !torch.vtensor<[?],si64>
    "torch.bind_symbolic_shape"(%4860, %1478) <{shape_expressions = #map24}> : (!torch.vtensor<[?],si64>, !torch.int) -> ()
    %4861 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %4862 = "torch.aten.index_select"(%4842, %4861, %4860) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int, !torch.vtensor<[?],si64>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%4862, %1478) <{shape_expressions = #map23}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %4863 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %4864 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %4865 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %4866 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %4867 = "torch.prim.ListConstruct"(%4863, %1481, %4864, %4865, %4866) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4868 = "torch.aten.view"(%4862, %4867) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%4868, %1478) <{shape_expressions = #map22}> : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %4869 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %4870 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %4871 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %4872 = "torch.prim.ListConstruct"(%4869, %1485, %4870, %4871) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4873 = "torch.aten.view"(%4868, %4872) : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%4873, %1478) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %4874 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %4875 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %4876 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %4877 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %4878 = "torch.aten.slice.Tensor"(%4855, %4874, %4875, %4876, %4877) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%4878, %1478) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %4879 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %4880 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %4881 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %4882 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %4883 = "torch.aten.slice.Tensor"(%4873, %4879, %4880, %4881, %4882) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%4883, %1478) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %4884 = "torch.constant.int"() <{value = -2 : i64}> : () -> !torch.int
    %4885 = "torch.aten.unsqueeze"(%4878, %4884) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%4885, %1478) <{shape_expressions = #map25}> : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.int) -> ()
    %4886 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %4887 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %4888 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %4889 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %4890 = "torch.prim.ListConstruct"(%4886, %1485, %4887, %4888, %4889) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4891 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %4892 = "torch.aten.expand"(%4885, %4890, %4891) : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%4892, %1478) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %4893 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %4894 = "torch.aten.clone"(%4892, %4893) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%4894, %1478) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %4895 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %4896 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %4897 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %4898 = "torch.prim.ListConstruct"(%4895, %1485, %4896, %4897) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4899 = "torch.aten._unsafe_view"(%4894, %4898) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%4899, %1478) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %4900 = "torch.constant.int"() <{value = -2 : i64}> : () -> !torch.int
    %4901 = "torch.aten.unsqueeze"(%4883, %4900) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%4901, %1478) <{shape_expressions = #map25}> : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.int) -> ()
    %4902 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %4903 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %4904 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %4905 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %4906 = "torch.prim.ListConstruct"(%4902, %1485, %4903, %4904, %4905) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4907 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %4908 = "torch.aten.expand"(%4901, %4906, %4907) : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%4908, %1478) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %4909 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %4910 = "torch.aten.clone"(%4908, %4909) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%4910, %1478) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %4911 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %4912 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %4913 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %4914 = "torch.prim.ListConstruct"(%4911, %1485, %4912, %4913) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4915 = "torch.aten._unsafe_view"(%4910, %4914) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%4915, %1478) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %4916 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %4917 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %4918 = "torch.aten.transpose.int"(%4700, %4916, %4917) : (!torch.vtensor<[4,1,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %4919 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %4920 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %4921 = "torch.aten.transpose.int"(%4899, %4919, %4920) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%4921, %1478) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %4922 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %4923 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %4924 = "torch.aten.transpose.int"(%4915, %4922, %4923) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%4924, %1478) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %4925 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %4926 = "torch.aten.squeeze.dim"(%1516, %4925) : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,1,1,?],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%4926, %1478) <{shape_expressions = #map7}> : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>, !torch.int) -> ()
    %4927 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %4928 = "torch.aten.squeeze.dim"(%4926, %4927) : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,1,1,?],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%4928, %1478) <{shape_expressions = #map7}> : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>, !torch.int) -> ()
    %4929 = "torch_c.to_builtin_tensor"(%4918) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>) -> tensor<4x32x1x128xf8E4M3FNUZ>
    %4930 = "tensor.cast"(%4929) : (tensor<4x32x1x128xf8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %4931 = "torch_c.to_builtin_tensor"(%4921) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %4932 = "torch_c.to_builtin_tensor"(%4924) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %4933 = "torch_c.to_builtin_tensor"(%4928) : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>) -> tensor<4x1x1x?xf8E4M3FNUZ>
    %4934 = "tensor.cast"(%4933) : (tensor<4x1x1x?xf8E4M3FNUZ>) -> tensor<?x?x?x?xf8E4M3FNUZ>
    %4935 = "torch_c.to_builtin_tensor"(%404) : (!torch.vtensor<[],f32>) -> tensor<f32>
    %4936 = "util.call"(%4930, %4931, %4932, %4935, %4934) <{callee = @sharktank_masked_flash_attention_4_32_128_128_f8E4M3FNUZ_f32_f32}> : (tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<f32>, tensor<?x?x?x?xf8E4M3FNUZ>) -> tensor<4x32x?x128xf32>
    %4937 = "tensor.cast"(%4936) : (tensor<4x32x?x128xf32>) -> tensor<4x32x1x128xf32>
    %4938 = "torch_c.from_builtin_tensor"(%4937) : (tensor<4x32x1x128xf32>) -> !torch.vtensor<[4,32,1,128],f32>
    %4939 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %4940 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %4941 = "torch.aten.transpose.int"(%4938, %4939, %4940) : (!torch.vtensor<[4,32,1,128],f32>, !torch.int, !torch.int) -> !torch.vtensor<[4,1,32,128],f32>
    %4942 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %4943 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %4944 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %4945 = "torch.prim.ListConstruct"(%4942, %4943, %4944) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4946 = "torch.aten.view"(%4941, %4945) : (!torch.vtensor<[4,1,32,128],f32>, !torch.list<int>) -> !torch.vtensor<[4,1,4096],f32>
    %4947 = "torch.aten.div.Tensor"(%4946, %406) : (!torch.vtensor<[4,1,4096],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],f32>
    %4948 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %4949 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %4950 = "torch.aten.clamp"(%4947, %4948, %4949) : (!torch.vtensor<[4,1,4096],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],f32>
    %4951 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %4952 = "torch.prims.convert_element_type"(%4950, %4951) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %4953 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %4954 = "torch.aten.unsqueeze"(%408, %4953) : (!torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>
    %4955 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %4956 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %4957 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %4958 = "torch.prim.ListConstruct"(%4955, %4956, %4957) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4959 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %4960 = "torch.aten.expand"(%4954, %4958, %4959) : (!torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ>
    %4961 = "torch_c.to_builtin_tensor"(%4952) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %4962 = "torch_c.to_builtin_tensor"(%4960) : (!torch.vtensor<[4,4096,4096],f8E4M3FNUZ>) -> tensor<4x4096x4096xf8E4M3FNUZ>
    %4963 = "util.call"(%4961, %4962) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x1x4096xf32>
    %4964 = "torch_c.from_builtin_tensor"(%4963) : (tensor<4x1x4096xf32>) -> !torch.vtensor<[4,1,4096],f32>
    %4965 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %4966 = "torch.prims.convert_element_type"(%4964, %4965) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %4967 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %4968 = "torch.aten.add.Tensor"(%4567, %4966, %4967) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %4969 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %4970 = "torch.prims.convert_element_type"(%4968, %4969) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f32>
    %4971 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %4972 = "torch.aten.pow.Tensor_Scalar"(%4970, %4971) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],f32>
    %4973 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %4974 = "torch.prim.ListConstruct"(%4973) : (!torch.int) -> !torch.list<int>
    %4975 = "torch.constant.bool"() <{value = true}> : () -> !torch.bool
    %4976 = "torch.constant.none"() : () -> !torch.none
    %4977 = "torch.aten.mean.dim"(%4972, %4974, %4975, %4976) : (!torch.vtensor<[4,1,4096],f32>, !torch.list<int>, !torch.bool, !torch.none) -> !torch.vtensor<[4,1,1],f32>
    %4978 = "torch.constant.float"() <{value = 1.000000e-05 : f64}> : () -> !torch.float
    %4979 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %4980 = "torch.aten.add.Scalar"(%4977, %4978, %4979) : (!torch.vtensor<[4,1,1],f32>, !torch.float, !torch.int) -> !torch.vtensor<[4,1,1],f32>
    %4981 = "torch.aten.rsqrt"(%4980) : (!torch.vtensor<[4,1,1],f32>) -> !torch.vtensor<[4,1,1],f32>
    %4982 = "torch.aten.mul.Tensor"(%4970, %4981) : (!torch.vtensor<[4,1,4096],f32>, !torch.vtensor<[4,1,1],f32>) -> !torch.vtensor<[4,1,4096],f32>
    %4983 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %4984 = "torch.prims.convert_element_type"(%4982, %4983) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %4985 = "torch.aten.mul.Tensor"(%410, %4984) : (!torch.vtensor<[4096],bf16>, !torch.vtensor<[4,1,4096],bf16>) -> !torch.vtensor<[4,1,4096],bf16>
    %4986 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %4987 = "torch.prims.convert_element_type"(%4985, %4986) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %4988 = "torch.aten.div.Tensor"(%4987, %412) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %4989 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %4990 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %4991 = "torch.aten.clamp"(%4988, %4989, %4990) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %4992 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %4993 = "torch.prims.convert_element_type"(%4991, %4992) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %4994 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %4995 = "torch.aten.unsqueeze"(%414, %4994) : (!torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>
    %4996 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %4997 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %4998 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %4999 = "torch.prim.ListConstruct"(%4996, %4997, %4998) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5000 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %5001 = "torch.aten.expand"(%4995, %4999, %5000) : (!torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ>
    %5002 = "torch_c.to_builtin_tensor"(%4993) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %5003 = "torch_c.to_builtin_tensor"(%5001) : (!torch.vtensor<[4,14336,4096],f8E4M3FNUZ>) -> tensor<4x14336x4096xf8E4M3FNUZ>
    %5004 = "util.call"(%5002, %5003) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x1x14336xf32>
    %5005 = "torch_c.from_builtin_tensor"(%5004) : (tensor<4x1x14336xf32>) -> !torch.vtensor<[4,1,14336],f32>
    %5006 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %5007 = "torch.prims.convert_element_type"(%5005, %5006) : (!torch.vtensor<[4,1,14336],f32>, !torch.int) -> !torch.vtensor<[4,1,14336],bf16>
    %5008 = "torch.aten.silu"(%5007) : (!torch.vtensor<[4,1,14336],bf16>) -> !torch.vtensor<[4,1,14336],bf16>
    %5009 = "torch.aten.div.Tensor"(%4987, %416) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %5010 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %5011 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %5012 = "torch.aten.clamp"(%5009, %5010, %5011) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %5013 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %5014 = "torch.prims.convert_element_type"(%5012, %5013) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %5015 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %5016 = "torch.aten.unsqueeze"(%418, %5015) : (!torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>
    %5017 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %5018 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %5019 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %5020 = "torch.prim.ListConstruct"(%5017, %5018, %5019) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5021 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %5022 = "torch.aten.expand"(%5016, %5020, %5021) : (!torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ>
    %5023 = "torch_c.to_builtin_tensor"(%5014) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %5024 = "torch_c.to_builtin_tensor"(%5022) : (!torch.vtensor<[4,14336,4096],f8E4M3FNUZ>) -> tensor<4x14336x4096xf8E4M3FNUZ>
    %5025 = "util.call"(%5023, %5024) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x1x14336xf32>
    %5026 = "torch_c.from_builtin_tensor"(%5025) : (tensor<4x1x14336xf32>) -> !torch.vtensor<[4,1,14336],f32>
    %5027 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %5028 = "torch.prims.convert_element_type"(%5026, %5027) : (!torch.vtensor<[4,1,14336],f32>, !torch.int) -> !torch.vtensor<[4,1,14336],bf16>
    %5029 = "torch.aten.mul.Tensor"(%5008, %5028) : (!torch.vtensor<[4,1,14336],bf16>, !torch.vtensor<[4,1,14336],bf16>) -> !torch.vtensor<[4,1,14336],bf16>
    %5030 = "torch.aten.div.Tensor"(%5029, %420) : (!torch.vtensor<[4,1,14336],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,14336],bf16>
    %5031 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %5032 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %5033 = "torch.aten.clamp"(%5030, %5031, %5032) : (!torch.vtensor<[4,1,14336],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,14336],bf16>
    %5034 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %5035 = "torch.prims.convert_element_type"(%5033, %5034) : (!torch.vtensor<[4,1,14336],bf16>, !torch.int) -> !torch.vtensor<[4,1,14336],f8E4M3FNUZ>
    %5036 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %5037 = "torch.aten.unsqueeze"(%422, %5036) : (!torch.vtensor<[4096,14336],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,14336],f8E4M3FNUZ>
    %5038 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %5039 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %5040 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %5041 = "torch.prim.ListConstruct"(%5038, %5039, %5040) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5042 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %5043 = "torch.aten.expand"(%5037, %5041, %5042) : (!torch.vtensor<[1,4096,14336],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,14336],f8E4M3FNUZ>
    %5044 = "torch_c.to_builtin_tensor"(%5035) : (!torch.vtensor<[4,1,14336],f8E4M3FNUZ>) -> tensor<4x1x14336xf8E4M3FNUZ>
    %5045 = "torch_c.to_builtin_tensor"(%5043) : (!torch.vtensor<[4,4096,14336],f8E4M3FNUZ>) -> tensor<4x4096x14336xf8E4M3FNUZ>
    %5046 = "util.call"(%5044, %5045) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x14336xf8E4M3FNUZ_R4x4096x14336xf8E4M3FNUZ}> : (tensor<4x1x14336xf8E4M3FNUZ>, tensor<4x4096x14336xf8E4M3FNUZ>) -> tensor<4x1x4096xf32>
    %5047 = "torch_c.from_builtin_tensor"(%5046) : (tensor<4x1x4096xf32>) -> !torch.vtensor<[4,1,4096],f32>
    %5048 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %5049 = "torch.prims.convert_element_type"(%5047, %5048) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %5050 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %5051 = "torch.aten.add.Tensor"(%4968, %5049, %5050) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %5052 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %5053 = "torch.prims.convert_element_type"(%5051, %5052) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f32>
    %5054 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %5055 = "torch.aten.pow.Tensor_Scalar"(%5053, %5054) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],f32>
    %5056 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %5057 = "torch.prim.ListConstruct"(%5056) : (!torch.int) -> !torch.list<int>
    %5058 = "torch.constant.bool"() <{value = true}> : () -> !torch.bool
    %5059 = "torch.constant.none"() : () -> !torch.none
    %5060 = "torch.aten.mean.dim"(%5055, %5057, %5058, %5059) : (!torch.vtensor<[4,1,4096],f32>, !torch.list<int>, !torch.bool, !torch.none) -> !torch.vtensor<[4,1,1],f32>
    %5061 = "torch.constant.float"() <{value = 1.000000e-05 : f64}> : () -> !torch.float
    %5062 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %5063 = "torch.aten.add.Scalar"(%5060, %5061, %5062) : (!torch.vtensor<[4,1,1],f32>, !torch.float, !torch.int) -> !torch.vtensor<[4,1,1],f32>
    %5064 = "torch.aten.rsqrt"(%5063) : (!torch.vtensor<[4,1,1],f32>) -> !torch.vtensor<[4,1,1],f32>
    %5065 = "torch.aten.mul.Tensor"(%5053, %5064) : (!torch.vtensor<[4,1,4096],f32>, !torch.vtensor<[4,1,1],f32>) -> !torch.vtensor<[4,1,4096],f32>
    %5066 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %5067 = "torch.prims.convert_element_type"(%5065, %5066) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %5068 = "torch.aten.mul.Tensor"(%424, %5067) : (!torch.vtensor<[4096],bf16>, !torch.vtensor<[4,1,4096],bf16>) -> !torch.vtensor<[4,1,4096],bf16>
    %5069 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %5070 = "torch.prims.convert_element_type"(%5068, %5069) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %5071 = "torch.aten.div.Tensor"(%5070, %426) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %5072 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %5073 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %5074 = "torch.aten.clamp"(%5071, %5072, %5073) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %5075 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %5076 = "torch.prims.convert_element_type"(%5074, %5075) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %5077 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %5078 = "torch.aten.unsqueeze"(%428, %5077) : (!torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>
    %5079 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %5080 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %5081 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %5082 = "torch.prim.ListConstruct"(%5079, %5080, %5081) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5083 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %5084 = "torch.aten.expand"(%5078, %5082, %5083) : (!torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ>
    %5085 = "torch_c.to_builtin_tensor"(%5076) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %5086 = "torch_c.to_builtin_tensor"(%5084) : (!torch.vtensor<[4,4096,4096],f8E4M3FNUZ>) -> tensor<4x4096x4096xf8E4M3FNUZ>
    %5087 = "util.call"(%5085, %5086) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x1x4096xf32>
    %5088 = "torch_c.from_builtin_tensor"(%5087) : (tensor<4x1x4096xf32>) -> !torch.vtensor<[4,1,4096],f32>
    %5089 = "torch.aten.div.Tensor"(%5088, %430) : (!torch.vtensor<[4,1,4096],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],f32>
    %5090 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %5091 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %5092 = "torch.aten.clamp"(%5089, %5090, %5091) : (!torch.vtensor<[4,1,4096],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],f32>
    %5093 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %5094 = "torch.prims.convert_element_type"(%5092, %5093) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %5095 = "torch.aten.div.Tensor"(%5070, %432) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %5096 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %5097 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %5098 = "torch.aten.clamp"(%5095, %5096, %5097) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %5099 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %5100 = "torch.prims.convert_element_type"(%5098, %5099) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %5101 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %5102 = "torch.aten.unsqueeze"(%434, %5101) : (!torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>
    %5103 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %5104 = "torch.constant.int"() <{value = 1024 : i64}> : () -> !torch.int
    %5105 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %5106 = "torch.prim.ListConstruct"(%5103, %5104, %5105) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5107 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %5108 = "torch.aten.expand"(%5102, %5106, %5107) : (!torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ>
    %5109 = "torch_c.to_builtin_tensor"(%5100) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %5110 = "torch_c.to_builtin_tensor"(%5108) : (!torch.vtensor<[4,1024,4096],f8E4M3FNUZ>) -> tensor<4x1024x4096xf8E4M3FNUZ>
    %5111 = "util.call"(%5109, %5110) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x1x1024xf32>
    %5112 = "torch_c.from_builtin_tensor"(%5111) : (tensor<4x1x1024xf32>) -> !torch.vtensor<[4,1,1024],f32>
    %5113 = "torch.aten.div.Tensor"(%5112, %436) : (!torch.vtensor<[4,1,1024],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,1024],f32>
    %5114 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %5115 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %5116 = "torch.aten.clamp"(%5113, %5114, %5115) : (!torch.vtensor<[4,1,1024],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,1024],f32>
    %5117 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %5118 = "torch.prims.convert_element_type"(%5116, %5117) : (!torch.vtensor<[4,1,1024],f32>, !torch.int) -> !torch.vtensor<[4,1,1024],f8E4M3FNUZ>
    %5119 = "torch.aten.div.Tensor"(%5070, %438) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %5120 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %5121 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %5122 = "torch.aten.clamp"(%5119, %5120, %5121) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %5123 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %5124 = "torch.prims.convert_element_type"(%5122, %5123) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %5125 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %5126 = "torch.aten.unsqueeze"(%440, %5125) : (!torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>
    %5127 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %5128 = "torch.constant.int"() <{value = 1024 : i64}> : () -> !torch.int
    %5129 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %5130 = "torch.prim.ListConstruct"(%5127, %5128, %5129) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5131 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %5132 = "torch.aten.expand"(%5126, %5130, %5131) : (!torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ>
    %5133 = "torch_c.to_builtin_tensor"(%5124) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %5134 = "torch_c.to_builtin_tensor"(%5132) : (!torch.vtensor<[4,1024,4096],f8E4M3FNUZ>) -> tensor<4x1024x4096xf8E4M3FNUZ>
    %5135 = "util.call"(%5133, %5134) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x1x1024xf32>
    %5136 = "torch_c.from_builtin_tensor"(%5135) : (tensor<4x1x1024xf32>) -> !torch.vtensor<[4,1,1024],f32>
    %5137 = "torch.aten.div.Tensor"(%5136, %442) : (!torch.vtensor<[4,1,1024],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,1024],f32>
    %5138 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %5139 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %5140 = "torch.aten.clamp"(%5137, %5138, %5139) : (!torch.vtensor<[4,1,1024],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,1024],f32>
    %5141 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %5142 = "torch.prims.convert_element_type"(%5140, %5141) : (!torch.vtensor<[4,1,1024],f32>, !torch.int) -> !torch.vtensor<[4,1,1024],f8E4M3FNUZ>
    %5143 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %5144 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %5145 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %5146 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %5147 = "torch.prim.ListConstruct"(%5143, %5144, %5145, %5146) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5148 = "torch.aten.view"(%5094, %5147) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,1,32,128],f8E4M3FNUZ>
    %5149 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %5150 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %5151 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %5152 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %5153 = "torch.prim.ListConstruct"(%5149, %5150, %5151, %5152) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5154 = "torch.aten.view"(%5118, %5153) : (!torch.vtensor<[4,1,1024],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>
    %5155 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %5156 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %5157 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %5158 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %5159 = "torch.prim.ListConstruct"(%5155, %5156, %5157, %5158) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5160 = "torch.aten.view"(%5142, %5159) : (!torch.vtensor<[4,1,1024],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>
    %5161 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %5162 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %5163 = "torch.aten.transpose.int"(%5148, %5161, %5162) : (!torch.vtensor<[4,1,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %5164 = "torch.aten.mul.Tensor"(%5163, %1637) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,1,1,128],bf16>) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %5165 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %5166 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %5167 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %5168 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %5169 = "torch.aten.slice.Tensor"(%5163, %5165, %5166, %5167, %5168) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,32,1,64],f8E4M3FNUZ>
    %5170 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %5171 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %5172 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %5173 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %5174 = "torch.aten.slice.Tensor"(%5163, %5170, %5171, %5172, %5173) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,32,1,64],f8E4M3FNUZ>
    %5175 = "torch.aten.neg"(%5174) : (!torch.vtensor<[4,32,1,64],f8E4M3FNUZ>) -> !torch.vtensor<[4,32,1,64],f8E4M3FNUZ>
    %5176 = "torch.prim.ListConstruct"(%5175, %5169) : (!torch.vtensor<[4,32,1,64],f8E4M3FNUZ>, !torch.vtensor<[4,32,1,64],f8E4M3FNUZ>) -> !torch.list<vtensor>
    %5177 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %5178 = "torch.aten.cat"(%5176, %5177) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %5179 = "torch.aten.mul.Tensor"(%5178, %1651) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,1,1,128],bf16>) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %5180 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %5181 = "torch.aten.add.Tensor"(%5164, %5179, %5180) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %5182 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %5183 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %5184 = "torch.aten.transpose.int"(%5181, %5182, %5183) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,1,32,128],f8E4M3FNUZ>
    %5185 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %5186 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %5187 = "torch.aten.transpose.int"(%5154, %5185, %5186) : (!torch.vtensor<[4,1,8,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %5188 = "torch.aten.mul.Tensor"(%5187, %1637) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,1,1,128],bf16>) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %5189 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %5190 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %5191 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %5192 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %5193 = "torch.aten.slice.Tensor"(%5187, %5189, %5190, %5191, %5192) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,8,1,64],f8E4M3FNUZ>
    %5194 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %5195 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %5196 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %5197 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %5198 = "torch.aten.slice.Tensor"(%5187, %5194, %5195, %5196, %5197) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,8,1,64],f8E4M3FNUZ>
    %5199 = "torch.aten.neg"(%5198) : (!torch.vtensor<[4,8,1,64],f8E4M3FNUZ>) -> !torch.vtensor<[4,8,1,64],f8E4M3FNUZ>
    %5200 = "torch.prim.ListConstruct"(%5199, %5193) : (!torch.vtensor<[4,8,1,64],f8E4M3FNUZ>, !torch.vtensor<[4,8,1,64],f8E4M3FNUZ>) -> !torch.list<vtensor>
    %5201 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %5202 = "torch.aten.cat"(%5200, %5201) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %5203 = "torch.aten.mul.Tensor"(%5202, %1651) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,1,1,128],bf16>) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %5204 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %5205 = "torch.aten.add.Tensor"(%5188, %5203, %5204) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %5206 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %5207 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %5208 = "torch.aten.transpose.int"(%5205, %5206, %5207) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>
    %5209 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %5210 = "torch.aten.floor_divide.Scalar"(%arg64, %5209) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4],si64>
    %5211 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %5212 = "torch.aten.unsqueeze"(%5210, %5211) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %5213 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %5214 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %5215 = "torch.aten.gather"(%arg65, %5213, %5212, %5214) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.vtensor<[4,1],si64>, !torch.bool) -> !torch.vtensor<[4,1],si64>
    %5216 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %5217 = "torch.aten.remainder.Scalar"(%arg64, %5216) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4],si64>
    %5218 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %5219 = "torch.aten.unsqueeze"(%5217, %5218) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %5220 = "torch.constant.none"() : () -> !torch.none
    %5221 = "torch.aten.clone"(%443, %5220) : (!torch.vtensor<[],si64>, !torch.none) -> !torch.vtensor<[],si64>
    %5222 = "torch.aten.detach"(%5221) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %5223 = "torch.aten.detach"(%5222) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %5224 = "torch.aten.detach"(%5223) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %5225 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %5226 = "torch.aten.unsqueeze"(%5224, %5225) : (!torch.vtensor<[],si64>, !torch.int) -> !torch.vtensor<[1],si64>
    %5227 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %5228 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %5229 = "torch.prim.ListConstruct"(%5227, %5228) : (!torch.int, !torch.int) -> !torch.list<int>
    %5230 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %5231 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %5232 = "torch.prim.ListConstruct"(%5230, %5231) : (!torch.int, !torch.int) -> !torch.list<int>
    %5233 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %5234 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %5235 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %5236 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %5237 = "torch.aten.empty_strided"(%5229, %5232, %5233, %5234, %5235, %5236) : (!torch.list<int>, !torch.list<int>, !torch.int, !torch.int, !torch.Device, !torch.bool) -> !torch.vtensor<[4,1],si64>
    %5238 = "torch.constant.int"() <{value = 7 : i64}> : () -> !torch.int
    %5239 = "torch.aten.fill.Scalar"(%5237, %5238) : (!torch.vtensor<[4,1],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %5240 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %5241 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %5242 = "torch.prim.ListConstruct"(%5240, %5241) : (!torch.int, !torch.int) -> !torch.list<int>
    %5243 = "torch.aten.repeat"(%5226, %5242) : (!torch.vtensor<[1],si64>, !torch.list<int>) -> !torch.vtensor<[4,1],si64>
    %5244 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %5245 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %5246 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %5247 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %5248 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %5249 = "torch.prim.ListConstruct"(%1483, %5244, %5245, %5246, %5247, %5248) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5250 = "torch.aten.view"(%4820, %5249) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%5250, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %5251 = "torch.prim.ListConstruct"(%5215, %5239, %5243, %5219) : (!torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>) -> !torch.list<optional<vtensor>>
    %5252 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %5253 = "torch.aten.index_put"(%5250, %5251, %5208, %5252) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>, !torch.bool) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%5253, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %5254 = "torch.constant.int"() <{value = 2097152 : i64}> : () -> !torch.int
    %5255 = "torch.prim.ListConstruct"(%1483, %5254) : (!torch.int, !torch.int) -> !torch.list<int>
    %5256 = "torch.aten.view"(%5253, %5255) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,2097152],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%5256, %1479) <{shape_expressions = #map2}> : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> ()
    %5257 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %5258 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %5259 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %5260 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %5261 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %5262 = "torch.prim.ListConstruct"(%1483, %5257, %5258, %5259, %5260, %5261) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5263 = "torch.aten.view"(%5256, %5262) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%5263, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %5264 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %5265 = "torch.aten.floor_divide.Scalar"(%arg64, %5264) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4],si64>
    %5266 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %5267 = "torch.aten.unsqueeze"(%5265, %5266) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %5268 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %5269 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %5270 = "torch.aten.gather"(%arg65, %5268, %5267, %5269) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.vtensor<[4,1],si64>, !torch.bool) -> !torch.vtensor<[4,1],si64>
    %5271 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %5272 = "torch.aten.remainder.Scalar"(%arg64, %5271) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4],si64>
    %5273 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %5274 = "torch.aten.unsqueeze"(%5272, %5273) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %5275 = "torch.constant.none"() : () -> !torch.none
    %5276 = "torch.aten.clone"(%444, %5275) : (!torch.vtensor<[],si64>, !torch.none) -> !torch.vtensor<[],si64>
    %5277 = "torch.aten.detach"(%5276) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %5278 = "torch.aten.detach"(%5277) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %5279 = "torch.aten.detach"(%5278) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %5280 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %5281 = "torch.aten.unsqueeze"(%5279, %5280) : (!torch.vtensor<[],si64>, !torch.int) -> !torch.vtensor<[1],si64>
    %5282 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %5283 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %5284 = "torch.prim.ListConstruct"(%5282, %5283) : (!torch.int, !torch.int) -> !torch.list<int>
    %5285 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %5286 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %5287 = "torch.prim.ListConstruct"(%5285, %5286) : (!torch.int, !torch.int) -> !torch.list<int>
    %5288 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %5289 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %5290 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %5291 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %5292 = "torch.aten.empty_strided"(%5284, %5287, %5288, %5289, %5290, %5291) : (!torch.list<int>, !torch.list<int>, !torch.int, !torch.int, !torch.Device, !torch.bool) -> !torch.vtensor<[4,1],si64>
    %5293 = "torch.constant.int"() <{value = 7 : i64}> : () -> !torch.int
    %5294 = "torch.aten.fill.Scalar"(%5292, %5293) : (!torch.vtensor<[4,1],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %5295 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %5296 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %5297 = "torch.prim.ListConstruct"(%5295, %5296) : (!torch.int, !torch.int) -> !torch.list<int>
    %5298 = "torch.aten.repeat"(%5281, %5297) : (!torch.vtensor<[1],si64>, !torch.list<int>) -> !torch.vtensor<[4,1],si64>
    %5299 = "torch.prim.ListConstruct"(%5270, %5294, %5298, %5274) : (!torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>) -> !torch.list<optional<vtensor>>
    %5300 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %5301 = "torch.aten.index_put"(%5263, %5299, %5160, %5300) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>, !torch.bool) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%5301, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %5302 = "torch.constant.int"() <{value = 2097152 : i64}> : () -> !torch.int
    %5303 = "torch.prim.ListConstruct"(%1483, %5302) : (!torch.int, !torch.int) -> !torch.list<int>
    %5304 = "torch.aten.view"(%5301, %5303) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,2097152],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%5304, %1479) <{shape_expressions = #map2}> : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> ()
    %5305 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %5306 = "torch.aten.mul.Scalar"(%arg65, %5305) : (!torch.vtensor<[4,?],si64>, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%5306, %1478) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %5307 = "torch.constant.int"() <{value = 14 : i64}> : () -> !torch.int
    %5308 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %5309 = "torch.aten.add.Scalar"(%5306, %5307, %5308) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%5309, %1478) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %5310 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %5311 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %5312 = "torch.aten.add.Scalar"(%5309, %5310, %5311) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%5312, %1478) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %5313 = "torch.prim.ListConstruct"(%1924) : (!torch.int) -> !torch.list<int>
    %5314 = "torch.aten.view"(%5312, %5313) : (!torch.vtensor<[4,?],si64>, !torch.list<int>) -> !torch.vtensor<[?],si64>
    "torch.bind_symbolic_shape"(%5314, %1478) <{shape_expressions = #map24}> : (!torch.vtensor<[?],si64>, !torch.int) -> ()
    %5315 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %5316 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %5317 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %5318 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %5319 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %5320 = "torch.prim.ListConstruct"(%1483, %5315, %5316, %5317, %5318, %5319) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5321 = "torch.aten.view"(%5304, %5320) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%5321, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %5322 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %5323 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %5324 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %5325 = "torch.prim.ListConstruct"(%1914, %5322, %5323, %5324) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5326 = "torch.aten.view"(%5321, %5325) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%5326, %1479) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %5327 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %5328 = "torch.aten.index_select"(%5326, %5327, %5314) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int, !torch.vtensor<[?],si64>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%5328, %1478) <{shape_expressions = #map23}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %5329 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %5330 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %5331 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %5332 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %5333 = "torch.prim.ListConstruct"(%5329, %1481, %5330, %5331, %5332) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5334 = "torch.aten.view"(%5328, %5333) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%5334, %1478) <{shape_expressions = #map22}> : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %5335 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %5336 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %5337 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %5338 = "torch.prim.ListConstruct"(%5335, %1485, %5336, %5337) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5339 = "torch.aten.view"(%5334, %5338) : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%5339, %1478) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %5340 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %5341 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %5342 = "torch.aten.add.Scalar"(%5309, %5340, %5341) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%5342, %1478) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %5343 = "torch.prim.ListConstruct"(%1924) : (!torch.int) -> !torch.list<int>
    %5344 = "torch.aten.view"(%5342, %5343) : (!torch.vtensor<[4,?],si64>, !torch.list<int>) -> !torch.vtensor<[?],si64>
    "torch.bind_symbolic_shape"(%5344, %1478) <{shape_expressions = #map24}> : (!torch.vtensor<[?],si64>, !torch.int) -> ()
    %5345 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %5346 = "torch.aten.index_select"(%5326, %5345, %5344) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int, !torch.vtensor<[?],si64>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%5346, %1478) <{shape_expressions = #map23}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %5347 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %5348 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %5349 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %5350 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %5351 = "torch.prim.ListConstruct"(%5347, %1481, %5348, %5349, %5350) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5352 = "torch.aten.view"(%5346, %5351) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%5352, %1478) <{shape_expressions = #map22}> : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %5353 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %5354 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %5355 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %5356 = "torch.prim.ListConstruct"(%5353, %1485, %5354, %5355) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5357 = "torch.aten.view"(%5352, %5356) : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%5357, %1478) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %5358 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %5359 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %5360 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %5361 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %5362 = "torch.aten.slice.Tensor"(%5339, %5358, %5359, %5360, %5361) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%5362, %1478) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %5363 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %5364 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %5365 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %5366 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %5367 = "torch.aten.slice.Tensor"(%5357, %5363, %5364, %5365, %5366) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%5367, %1478) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %5368 = "torch.constant.int"() <{value = -2 : i64}> : () -> !torch.int
    %5369 = "torch.aten.unsqueeze"(%5362, %5368) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%5369, %1478) <{shape_expressions = #map25}> : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.int) -> ()
    %5370 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %5371 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %5372 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %5373 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %5374 = "torch.prim.ListConstruct"(%5370, %1485, %5371, %5372, %5373) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5375 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %5376 = "torch.aten.expand"(%5369, %5374, %5375) : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%5376, %1478) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %5377 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %5378 = "torch.aten.clone"(%5376, %5377) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%5378, %1478) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %5379 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %5380 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %5381 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %5382 = "torch.prim.ListConstruct"(%5379, %1485, %5380, %5381) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5383 = "torch.aten._unsafe_view"(%5378, %5382) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%5383, %1478) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %5384 = "torch.constant.int"() <{value = -2 : i64}> : () -> !torch.int
    %5385 = "torch.aten.unsqueeze"(%5367, %5384) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%5385, %1478) <{shape_expressions = #map25}> : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.int) -> ()
    %5386 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %5387 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %5388 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %5389 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %5390 = "torch.prim.ListConstruct"(%5386, %1485, %5387, %5388, %5389) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5391 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %5392 = "torch.aten.expand"(%5385, %5390, %5391) : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%5392, %1478) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %5393 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %5394 = "torch.aten.clone"(%5392, %5393) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%5394, %1478) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %5395 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %5396 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %5397 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %5398 = "torch.prim.ListConstruct"(%5395, %1485, %5396, %5397) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5399 = "torch.aten._unsafe_view"(%5394, %5398) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%5399, %1478) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %5400 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %5401 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %5402 = "torch.aten.transpose.int"(%5184, %5400, %5401) : (!torch.vtensor<[4,1,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %5403 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %5404 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %5405 = "torch.aten.transpose.int"(%5383, %5403, %5404) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%5405, %1478) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %5406 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %5407 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %5408 = "torch.aten.transpose.int"(%5399, %5406, %5407) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%5408, %1478) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %5409 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %5410 = "torch.aten.squeeze.dim"(%1516, %5409) : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,1,1,?],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%5410, %1478) <{shape_expressions = #map7}> : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>, !torch.int) -> ()
    %5411 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %5412 = "torch.aten.squeeze.dim"(%5410, %5411) : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,1,1,?],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%5412, %1478) <{shape_expressions = #map7}> : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>, !torch.int) -> ()
    %5413 = "torch_c.to_builtin_tensor"(%5402) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>) -> tensor<4x32x1x128xf8E4M3FNUZ>
    %5414 = "tensor.cast"(%5413) : (tensor<4x32x1x128xf8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %5415 = "torch_c.to_builtin_tensor"(%5405) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %5416 = "torch_c.to_builtin_tensor"(%5408) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %5417 = "torch_c.to_builtin_tensor"(%5412) : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>) -> tensor<4x1x1x?xf8E4M3FNUZ>
    %5418 = "tensor.cast"(%5417) : (tensor<4x1x1x?xf8E4M3FNUZ>) -> tensor<?x?x?x?xf8E4M3FNUZ>
    %5419 = "torch_c.to_builtin_tensor"(%446) : (!torch.vtensor<[],f32>) -> tensor<f32>
    %5420 = "util.call"(%5414, %5415, %5416, %5419, %5418) <{callee = @sharktank_masked_flash_attention_4_32_128_128_f8E4M3FNUZ_f32_f32}> : (tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<f32>, tensor<?x?x?x?xf8E4M3FNUZ>) -> tensor<4x32x?x128xf32>
    %5421 = "tensor.cast"(%5420) : (tensor<4x32x?x128xf32>) -> tensor<4x32x1x128xf32>
    %5422 = "torch_c.from_builtin_tensor"(%5421) : (tensor<4x32x1x128xf32>) -> !torch.vtensor<[4,32,1,128],f32>
    %5423 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %5424 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %5425 = "torch.aten.transpose.int"(%5422, %5423, %5424) : (!torch.vtensor<[4,32,1,128],f32>, !torch.int, !torch.int) -> !torch.vtensor<[4,1,32,128],f32>
    %5426 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %5427 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %5428 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %5429 = "torch.prim.ListConstruct"(%5426, %5427, %5428) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5430 = "torch.aten.view"(%5425, %5429) : (!torch.vtensor<[4,1,32,128],f32>, !torch.list<int>) -> !torch.vtensor<[4,1,4096],f32>
    %5431 = "torch.aten.div.Tensor"(%5430, %448) : (!torch.vtensor<[4,1,4096],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],f32>
    %5432 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %5433 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %5434 = "torch.aten.clamp"(%5431, %5432, %5433) : (!torch.vtensor<[4,1,4096],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],f32>
    %5435 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %5436 = "torch.prims.convert_element_type"(%5434, %5435) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %5437 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %5438 = "torch.aten.unsqueeze"(%450, %5437) : (!torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>
    %5439 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %5440 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %5441 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %5442 = "torch.prim.ListConstruct"(%5439, %5440, %5441) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5443 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %5444 = "torch.aten.expand"(%5438, %5442, %5443) : (!torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ>
    %5445 = "torch_c.to_builtin_tensor"(%5436) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %5446 = "torch_c.to_builtin_tensor"(%5444) : (!torch.vtensor<[4,4096,4096],f8E4M3FNUZ>) -> tensor<4x4096x4096xf8E4M3FNUZ>
    %5447 = "util.call"(%5445, %5446) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x1x4096xf32>
    %5448 = "torch_c.from_builtin_tensor"(%5447) : (tensor<4x1x4096xf32>) -> !torch.vtensor<[4,1,4096],f32>
    %5449 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %5450 = "torch.prims.convert_element_type"(%5448, %5449) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %5451 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %5452 = "torch.aten.add.Tensor"(%5051, %5450, %5451) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %5453 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %5454 = "torch.prims.convert_element_type"(%5452, %5453) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f32>
    %5455 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %5456 = "torch.aten.pow.Tensor_Scalar"(%5454, %5455) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],f32>
    %5457 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %5458 = "torch.prim.ListConstruct"(%5457) : (!torch.int) -> !torch.list<int>
    %5459 = "torch.constant.bool"() <{value = true}> : () -> !torch.bool
    %5460 = "torch.constant.none"() : () -> !torch.none
    %5461 = "torch.aten.mean.dim"(%5456, %5458, %5459, %5460) : (!torch.vtensor<[4,1,4096],f32>, !torch.list<int>, !torch.bool, !torch.none) -> !torch.vtensor<[4,1,1],f32>
    %5462 = "torch.constant.float"() <{value = 1.000000e-05 : f64}> : () -> !torch.float
    %5463 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %5464 = "torch.aten.add.Scalar"(%5461, %5462, %5463) : (!torch.vtensor<[4,1,1],f32>, !torch.float, !torch.int) -> !torch.vtensor<[4,1,1],f32>
    %5465 = "torch.aten.rsqrt"(%5464) : (!torch.vtensor<[4,1,1],f32>) -> !torch.vtensor<[4,1,1],f32>
    %5466 = "torch.aten.mul.Tensor"(%5454, %5465) : (!torch.vtensor<[4,1,4096],f32>, !torch.vtensor<[4,1,1],f32>) -> !torch.vtensor<[4,1,4096],f32>
    %5467 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %5468 = "torch.prims.convert_element_type"(%5466, %5467) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %5469 = "torch.aten.mul.Tensor"(%452, %5468) : (!torch.vtensor<[4096],bf16>, !torch.vtensor<[4,1,4096],bf16>) -> !torch.vtensor<[4,1,4096],bf16>
    %5470 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %5471 = "torch.prims.convert_element_type"(%5469, %5470) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %5472 = "torch.aten.div.Tensor"(%5471, %454) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %5473 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %5474 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %5475 = "torch.aten.clamp"(%5472, %5473, %5474) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %5476 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %5477 = "torch.prims.convert_element_type"(%5475, %5476) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %5478 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %5479 = "torch.aten.unsqueeze"(%456, %5478) : (!torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>
    %5480 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %5481 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %5482 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %5483 = "torch.prim.ListConstruct"(%5480, %5481, %5482) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5484 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %5485 = "torch.aten.expand"(%5479, %5483, %5484) : (!torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ>
    %5486 = "torch_c.to_builtin_tensor"(%5477) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %5487 = "torch_c.to_builtin_tensor"(%5485) : (!torch.vtensor<[4,14336,4096],f8E4M3FNUZ>) -> tensor<4x14336x4096xf8E4M3FNUZ>
    %5488 = "util.call"(%5486, %5487) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x1x14336xf32>
    %5489 = "torch_c.from_builtin_tensor"(%5488) : (tensor<4x1x14336xf32>) -> !torch.vtensor<[4,1,14336],f32>
    %5490 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %5491 = "torch.prims.convert_element_type"(%5489, %5490) : (!torch.vtensor<[4,1,14336],f32>, !torch.int) -> !torch.vtensor<[4,1,14336],bf16>
    %5492 = "torch.aten.silu"(%5491) : (!torch.vtensor<[4,1,14336],bf16>) -> !torch.vtensor<[4,1,14336],bf16>
    %5493 = "torch.aten.div.Tensor"(%5471, %458) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %5494 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %5495 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %5496 = "torch.aten.clamp"(%5493, %5494, %5495) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %5497 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %5498 = "torch.prims.convert_element_type"(%5496, %5497) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %5499 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %5500 = "torch.aten.unsqueeze"(%460, %5499) : (!torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>
    %5501 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %5502 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %5503 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %5504 = "torch.prim.ListConstruct"(%5501, %5502, %5503) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5505 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %5506 = "torch.aten.expand"(%5500, %5504, %5505) : (!torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ>
    %5507 = "torch_c.to_builtin_tensor"(%5498) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %5508 = "torch_c.to_builtin_tensor"(%5506) : (!torch.vtensor<[4,14336,4096],f8E4M3FNUZ>) -> tensor<4x14336x4096xf8E4M3FNUZ>
    %5509 = "util.call"(%5507, %5508) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x1x14336xf32>
    %5510 = "torch_c.from_builtin_tensor"(%5509) : (tensor<4x1x14336xf32>) -> !torch.vtensor<[4,1,14336],f32>
    %5511 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %5512 = "torch.prims.convert_element_type"(%5510, %5511) : (!torch.vtensor<[4,1,14336],f32>, !torch.int) -> !torch.vtensor<[4,1,14336],bf16>
    %5513 = "torch.aten.mul.Tensor"(%5492, %5512) : (!torch.vtensor<[4,1,14336],bf16>, !torch.vtensor<[4,1,14336],bf16>) -> !torch.vtensor<[4,1,14336],bf16>
    %5514 = "torch.aten.div.Tensor"(%5513, %462) : (!torch.vtensor<[4,1,14336],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,14336],bf16>
    %5515 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %5516 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %5517 = "torch.aten.clamp"(%5514, %5515, %5516) : (!torch.vtensor<[4,1,14336],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,14336],bf16>
    %5518 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %5519 = "torch.prims.convert_element_type"(%5517, %5518) : (!torch.vtensor<[4,1,14336],bf16>, !torch.int) -> !torch.vtensor<[4,1,14336],f8E4M3FNUZ>
    %5520 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %5521 = "torch.aten.unsqueeze"(%464, %5520) : (!torch.vtensor<[4096,14336],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,14336],f8E4M3FNUZ>
    %5522 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %5523 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %5524 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %5525 = "torch.prim.ListConstruct"(%5522, %5523, %5524) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5526 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %5527 = "torch.aten.expand"(%5521, %5525, %5526) : (!torch.vtensor<[1,4096,14336],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,14336],f8E4M3FNUZ>
    %5528 = "torch_c.to_builtin_tensor"(%5519) : (!torch.vtensor<[4,1,14336],f8E4M3FNUZ>) -> tensor<4x1x14336xf8E4M3FNUZ>
    %5529 = "torch_c.to_builtin_tensor"(%5527) : (!torch.vtensor<[4,4096,14336],f8E4M3FNUZ>) -> tensor<4x4096x14336xf8E4M3FNUZ>
    %5530 = "util.call"(%5528, %5529) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x14336xf8E4M3FNUZ_R4x4096x14336xf8E4M3FNUZ}> : (tensor<4x1x14336xf8E4M3FNUZ>, tensor<4x4096x14336xf8E4M3FNUZ>) -> tensor<4x1x4096xf32>
    %5531 = "torch_c.from_builtin_tensor"(%5530) : (tensor<4x1x4096xf32>) -> !torch.vtensor<[4,1,4096],f32>
    %5532 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %5533 = "torch.prims.convert_element_type"(%5531, %5532) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %5534 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %5535 = "torch.aten.add.Tensor"(%5452, %5533, %5534) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %5536 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %5537 = "torch.prims.convert_element_type"(%5535, %5536) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f32>
    %5538 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %5539 = "torch.aten.pow.Tensor_Scalar"(%5537, %5538) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],f32>
    %5540 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %5541 = "torch.prim.ListConstruct"(%5540) : (!torch.int) -> !torch.list<int>
    %5542 = "torch.constant.bool"() <{value = true}> : () -> !torch.bool
    %5543 = "torch.constant.none"() : () -> !torch.none
    %5544 = "torch.aten.mean.dim"(%5539, %5541, %5542, %5543) : (!torch.vtensor<[4,1,4096],f32>, !torch.list<int>, !torch.bool, !torch.none) -> !torch.vtensor<[4,1,1],f32>
    %5545 = "torch.constant.float"() <{value = 1.000000e-05 : f64}> : () -> !torch.float
    %5546 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %5547 = "torch.aten.add.Scalar"(%5544, %5545, %5546) : (!torch.vtensor<[4,1,1],f32>, !torch.float, !torch.int) -> !torch.vtensor<[4,1,1],f32>
    %5548 = "torch.aten.rsqrt"(%5547) : (!torch.vtensor<[4,1,1],f32>) -> !torch.vtensor<[4,1,1],f32>
    %5549 = "torch.aten.mul.Tensor"(%5537, %5548) : (!torch.vtensor<[4,1,4096],f32>, !torch.vtensor<[4,1,1],f32>) -> !torch.vtensor<[4,1,4096],f32>
    %5550 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %5551 = "torch.prims.convert_element_type"(%5549, %5550) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %5552 = "torch.aten.mul.Tensor"(%466, %5551) : (!torch.vtensor<[4096],bf16>, !torch.vtensor<[4,1,4096],bf16>) -> !torch.vtensor<[4,1,4096],bf16>
    %5553 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %5554 = "torch.prims.convert_element_type"(%5552, %5553) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %5555 = "torch.aten.div.Tensor"(%5554, %468) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %5556 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %5557 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %5558 = "torch.aten.clamp"(%5555, %5556, %5557) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %5559 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %5560 = "torch.prims.convert_element_type"(%5558, %5559) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %5561 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %5562 = "torch.aten.unsqueeze"(%470, %5561) : (!torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>
    %5563 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %5564 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %5565 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %5566 = "torch.prim.ListConstruct"(%5563, %5564, %5565) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5567 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %5568 = "torch.aten.expand"(%5562, %5566, %5567) : (!torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ>
    %5569 = "torch_c.to_builtin_tensor"(%5560) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %5570 = "torch_c.to_builtin_tensor"(%5568) : (!torch.vtensor<[4,4096,4096],f8E4M3FNUZ>) -> tensor<4x4096x4096xf8E4M3FNUZ>
    %5571 = "util.call"(%5569, %5570) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x1x4096xf32>
    %5572 = "torch_c.from_builtin_tensor"(%5571) : (tensor<4x1x4096xf32>) -> !torch.vtensor<[4,1,4096],f32>
    %5573 = "torch.aten.div.Tensor"(%5572, %472) : (!torch.vtensor<[4,1,4096],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],f32>
    %5574 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %5575 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %5576 = "torch.aten.clamp"(%5573, %5574, %5575) : (!torch.vtensor<[4,1,4096],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],f32>
    %5577 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %5578 = "torch.prims.convert_element_type"(%5576, %5577) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %5579 = "torch.aten.div.Tensor"(%5554, %474) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %5580 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %5581 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %5582 = "torch.aten.clamp"(%5579, %5580, %5581) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %5583 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %5584 = "torch.prims.convert_element_type"(%5582, %5583) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %5585 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %5586 = "torch.aten.unsqueeze"(%476, %5585) : (!torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>
    %5587 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %5588 = "torch.constant.int"() <{value = 1024 : i64}> : () -> !torch.int
    %5589 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %5590 = "torch.prim.ListConstruct"(%5587, %5588, %5589) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5591 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %5592 = "torch.aten.expand"(%5586, %5590, %5591) : (!torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ>
    %5593 = "torch_c.to_builtin_tensor"(%5584) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %5594 = "torch_c.to_builtin_tensor"(%5592) : (!torch.vtensor<[4,1024,4096],f8E4M3FNUZ>) -> tensor<4x1024x4096xf8E4M3FNUZ>
    %5595 = "util.call"(%5593, %5594) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x1x1024xf32>
    %5596 = "torch_c.from_builtin_tensor"(%5595) : (tensor<4x1x1024xf32>) -> !torch.vtensor<[4,1,1024],f32>
    %5597 = "torch.aten.div.Tensor"(%5596, %478) : (!torch.vtensor<[4,1,1024],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,1024],f32>
    %5598 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %5599 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %5600 = "torch.aten.clamp"(%5597, %5598, %5599) : (!torch.vtensor<[4,1,1024],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,1024],f32>
    %5601 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %5602 = "torch.prims.convert_element_type"(%5600, %5601) : (!torch.vtensor<[4,1,1024],f32>, !torch.int) -> !torch.vtensor<[4,1,1024],f8E4M3FNUZ>
    %5603 = "torch.aten.div.Tensor"(%5554, %480) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %5604 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %5605 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %5606 = "torch.aten.clamp"(%5603, %5604, %5605) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %5607 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %5608 = "torch.prims.convert_element_type"(%5606, %5607) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %5609 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %5610 = "torch.aten.unsqueeze"(%482, %5609) : (!torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>
    %5611 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %5612 = "torch.constant.int"() <{value = 1024 : i64}> : () -> !torch.int
    %5613 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %5614 = "torch.prim.ListConstruct"(%5611, %5612, %5613) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5615 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %5616 = "torch.aten.expand"(%5610, %5614, %5615) : (!torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ>
    %5617 = "torch_c.to_builtin_tensor"(%5608) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %5618 = "torch_c.to_builtin_tensor"(%5616) : (!torch.vtensor<[4,1024,4096],f8E4M3FNUZ>) -> tensor<4x1024x4096xf8E4M3FNUZ>
    %5619 = "util.call"(%5617, %5618) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x1x1024xf32>
    %5620 = "torch_c.from_builtin_tensor"(%5619) : (tensor<4x1x1024xf32>) -> !torch.vtensor<[4,1,1024],f32>
    %5621 = "torch.aten.div.Tensor"(%5620, %484) : (!torch.vtensor<[4,1,1024],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,1024],f32>
    %5622 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %5623 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %5624 = "torch.aten.clamp"(%5621, %5622, %5623) : (!torch.vtensor<[4,1,1024],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,1024],f32>
    %5625 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %5626 = "torch.prims.convert_element_type"(%5624, %5625) : (!torch.vtensor<[4,1,1024],f32>, !torch.int) -> !torch.vtensor<[4,1,1024],f8E4M3FNUZ>
    %5627 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %5628 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %5629 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %5630 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %5631 = "torch.prim.ListConstruct"(%5627, %5628, %5629, %5630) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5632 = "torch.aten.view"(%5578, %5631) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,1,32,128],f8E4M3FNUZ>
    %5633 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %5634 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %5635 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %5636 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %5637 = "torch.prim.ListConstruct"(%5633, %5634, %5635, %5636) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5638 = "torch.aten.view"(%5602, %5637) : (!torch.vtensor<[4,1,1024],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>
    %5639 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %5640 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %5641 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %5642 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %5643 = "torch.prim.ListConstruct"(%5639, %5640, %5641, %5642) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5644 = "torch.aten.view"(%5626, %5643) : (!torch.vtensor<[4,1,1024],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>
    %5645 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %5646 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %5647 = "torch.aten.transpose.int"(%5632, %5645, %5646) : (!torch.vtensor<[4,1,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %5648 = "torch.aten.mul.Tensor"(%5647, %1637) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,1,1,128],bf16>) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %5649 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %5650 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %5651 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %5652 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %5653 = "torch.aten.slice.Tensor"(%5647, %5649, %5650, %5651, %5652) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,32,1,64],f8E4M3FNUZ>
    %5654 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %5655 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %5656 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %5657 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %5658 = "torch.aten.slice.Tensor"(%5647, %5654, %5655, %5656, %5657) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,32,1,64],f8E4M3FNUZ>
    %5659 = "torch.aten.neg"(%5658) : (!torch.vtensor<[4,32,1,64],f8E4M3FNUZ>) -> !torch.vtensor<[4,32,1,64],f8E4M3FNUZ>
    %5660 = "torch.prim.ListConstruct"(%5659, %5653) : (!torch.vtensor<[4,32,1,64],f8E4M3FNUZ>, !torch.vtensor<[4,32,1,64],f8E4M3FNUZ>) -> !torch.list<vtensor>
    %5661 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %5662 = "torch.aten.cat"(%5660, %5661) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %5663 = "torch.aten.mul.Tensor"(%5662, %1651) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,1,1,128],bf16>) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %5664 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %5665 = "torch.aten.add.Tensor"(%5648, %5663, %5664) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %5666 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %5667 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %5668 = "torch.aten.transpose.int"(%5665, %5666, %5667) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,1,32,128],f8E4M3FNUZ>
    %5669 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %5670 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %5671 = "torch.aten.transpose.int"(%5638, %5669, %5670) : (!torch.vtensor<[4,1,8,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %5672 = "torch.aten.mul.Tensor"(%5671, %1637) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,1,1,128],bf16>) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %5673 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %5674 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %5675 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %5676 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %5677 = "torch.aten.slice.Tensor"(%5671, %5673, %5674, %5675, %5676) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,8,1,64],f8E4M3FNUZ>
    %5678 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %5679 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %5680 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %5681 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %5682 = "torch.aten.slice.Tensor"(%5671, %5678, %5679, %5680, %5681) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,8,1,64],f8E4M3FNUZ>
    %5683 = "torch.aten.neg"(%5682) : (!torch.vtensor<[4,8,1,64],f8E4M3FNUZ>) -> !torch.vtensor<[4,8,1,64],f8E4M3FNUZ>
    %5684 = "torch.prim.ListConstruct"(%5683, %5677) : (!torch.vtensor<[4,8,1,64],f8E4M3FNUZ>, !torch.vtensor<[4,8,1,64],f8E4M3FNUZ>) -> !torch.list<vtensor>
    %5685 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %5686 = "torch.aten.cat"(%5684, %5685) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %5687 = "torch.aten.mul.Tensor"(%5686, %1651) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,1,1,128],bf16>) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %5688 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %5689 = "torch.aten.add.Tensor"(%5672, %5687, %5688) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %5690 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %5691 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %5692 = "torch.aten.transpose.int"(%5689, %5690, %5691) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>
    %5693 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %5694 = "torch.aten.floor_divide.Scalar"(%arg64, %5693) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4],si64>
    %5695 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %5696 = "torch.aten.unsqueeze"(%5694, %5695) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %5697 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %5698 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %5699 = "torch.aten.gather"(%arg65, %5697, %5696, %5698) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.vtensor<[4,1],si64>, !torch.bool) -> !torch.vtensor<[4,1],si64>
    %5700 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %5701 = "torch.aten.remainder.Scalar"(%arg64, %5700) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4],si64>
    %5702 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %5703 = "torch.aten.unsqueeze"(%5701, %5702) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %5704 = "torch.constant.none"() : () -> !torch.none
    %5705 = "torch.aten.clone"(%485, %5704) : (!torch.vtensor<[],si64>, !torch.none) -> !torch.vtensor<[],si64>
    %5706 = "torch.aten.detach"(%5705) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %5707 = "torch.aten.detach"(%5706) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %5708 = "torch.aten.detach"(%5707) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %5709 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %5710 = "torch.aten.unsqueeze"(%5708, %5709) : (!torch.vtensor<[],si64>, !torch.int) -> !torch.vtensor<[1],si64>
    %5711 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %5712 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %5713 = "torch.prim.ListConstruct"(%5711, %5712) : (!torch.int, !torch.int) -> !torch.list<int>
    %5714 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %5715 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %5716 = "torch.prim.ListConstruct"(%5714, %5715) : (!torch.int, !torch.int) -> !torch.list<int>
    %5717 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %5718 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %5719 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %5720 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %5721 = "torch.aten.empty_strided"(%5713, %5716, %5717, %5718, %5719, %5720) : (!torch.list<int>, !torch.list<int>, !torch.int, !torch.int, !torch.Device, !torch.bool) -> !torch.vtensor<[4,1],si64>
    %5722 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %5723 = "torch.aten.fill.Scalar"(%5721, %5722) : (!torch.vtensor<[4,1],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %5724 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %5725 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %5726 = "torch.prim.ListConstruct"(%5724, %5725) : (!torch.int, !torch.int) -> !torch.list<int>
    %5727 = "torch.aten.repeat"(%5710, %5726) : (!torch.vtensor<[1],si64>, !torch.list<int>) -> !torch.vtensor<[4,1],si64>
    %5728 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %5729 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %5730 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %5731 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %5732 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %5733 = "torch.prim.ListConstruct"(%1483, %5728, %5729, %5730, %5731, %5732) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5734 = "torch.aten.view"(%5304, %5733) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%5734, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %5735 = "torch.prim.ListConstruct"(%5699, %5723, %5727, %5703) : (!torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>) -> !torch.list<optional<vtensor>>
    %5736 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %5737 = "torch.aten.index_put"(%5734, %5735, %5692, %5736) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>, !torch.bool) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%5737, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %5738 = "torch.constant.int"() <{value = 2097152 : i64}> : () -> !torch.int
    %5739 = "torch.prim.ListConstruct"(%1483, %5738) : (!torch.int, !torch.int) -> !torch.list<int>
    %5740 = "torch.aten.view"(%5737, %5739) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,2097152],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%5740, %1479) <{shape_expressions = #map2}> : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> ()
    %5741 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %5742 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %5743 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %5744 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %5745 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %5746 = "torch.prim.ListConstruct"(%1483, %5741, %5742, %5743, %5744, %5745) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5747 = "torch.aten.view"(%5740, %5746) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%5747, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %5748 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %5749 = "torch.aten.floor_divide.Scalar"(%arg64, %5748) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4],si64>
    %5750 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %5751 = "torch.aten.unsqueeze"(%5749, %5750) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %5752 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %5753 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %5754 = "torch.aten.gather"(%arg65, %5752, %5751, %5753) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.vtensor<[4,1],si64>, !torch.bool) -> !torch.vtensor<[4,1],si64>
    %5755 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %5756 = "torch.aten.remainder.Scalar"(%arg64, %5755) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4],si64>
    %5757 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %5758 = "torch.aten.unsqueeze"(%5756, %5757) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %5759 = "torch.constant.none"() : () -> !torch.none
    %5760 = "torch.aten.clone"(%486, %5759) : (!torch.vtensor<[],si64>, !torch.none) -> !torch.vtensor<[],si64>
    %5761 = "torch.aten.detach"(%5760) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %5762 = "torch.aten.detach"(%5761) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %5763 = "torch.aten.detach"(%5762) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %5764 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %5765 = "torch.aten.unsqueeze"(%5763, %5764) : (!torch.vtensor<[],si64>, !torch.int) -> !torch.vtensor<[1],si64>
    %5766 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %5767 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %5768 = "torch.prim.ListConstruct"(%5766, %5767) : (!torch.int, !torch.int) -> !torch.list<int>
    %5769 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %5770 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %5771 = "torch.prim.ListConstruct"(%5769, %5770) : (!torch.int, !torch.int) -> !torch.list<int>
    %5772 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %5773 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %5774 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %5775 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %5776 = "torch.aten.empty_strided"(%5768, %5771, %5772, %5773, %5774, %5775) : (!torch.list<int>, !torch.list<int>, !torch.int, !torch.int, !torch.Device, !torch.bool) -> !torch.vtensor<[4,1],si64>
    %5777 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %5778 = "torch.aten.fill.Scalar"(%5776, %5777) : (!torch.vtensor<[4,1],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %5779 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %5780 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %5781 = "torch.prim.ListConstruct"(%5779, %5780) : (!torch.int, !torch.int) -> !torch.list<int>
    %5782 = "torch.aten.repeat"(%5765, %5781) : (!torch.vtensor<[1],si64>, !torch.list<int>) -> !torch.vtensor<[4,1],si64>
    %5783 = "torch.prim.ListConstruct"(%5754, %5778, %5782, %5758) : (!torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>) -> !torch.list<optional<vtensor>>
    %5784 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %5785 = "torch.aten.index_put"(%5747, %5783, %5644, %5784) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>, !torch.bool) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%5785, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %5786 = "torch.constant.int"() <{value = 2097152 : i64}> : () -> !torch.int
    %5787 = "torch.prim.ListConstruct"(%1483, %5786) : (!torch.int, !torch.int) -> !torch.list<int>
    %5788 = "torch.aten.view"(%5785, %5787) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,2097152],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%5788, %1479) <{shape_expressions = #map2}> : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> ()
    %5789 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %5790 = "torch.aten.mul.Scalar"(%arg65, %5789) : (!torch.vtensor<[4,?],si64>, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%5790, %1478) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %5791 = "torch.constant.int"() <{value = 16 : i64}> : () -> !torch.int
    %5792 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %5793 = "torch.aten.add.Scalar"(%5790, %5791, %5792) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%5793, %1478) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %5794 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %5795 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %5796 = "torch.aten.add.Scalar"(%5793, %5794, %5795) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%5796, %1478) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %5797 = "torch.prim.ListConstruct"(%1924) : (!torch.int) -> !torch.list<int>
    %5798 = "torch.aten.view"(%5796, %5797) : (!torch.vtensor<[4,?],si64>, !torch.list<int>) -> !torch.vtensor<[?],si64>
    "torch.bind_symbolic_shape"(%5798, %1478) <{shape_expressions = #map24}> : (!torch.vtensor<[?],si64>, !torch.int) -> ()
    %5799 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %5800 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %5801 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %5802 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %5803 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %5804 = "torch.prim.ListConstruct"(%1483, %5799, %5800, %5801, %5802, %5803) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5805 = "torch.aten.view"(%5788, %5804) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%5805, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %5806 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %5807 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %5808 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %5809 = "torch.prim.ListConstruct"(%1914, %5806, %5807, %5808) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5810 = "torch.aten.view"(%5805, %5809) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%5810, %1479) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %5811 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %5812 = "torch.aten.index_select"(%5810, %5811, %5798) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int, !torch.vtensor<[?],si64>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%5812, %1478) <{shape_expressions = #map23}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %5813 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %5814 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %5815 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %5816 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %5817 = "torch.prim.ListConstruct"(%5813, %1481, %5814, %5815, %5816) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5818 = "torch.aten.view"(%5812, %5817) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%5818, %1478) <{shape_expressions = #map22}> : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %5819 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %5820 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %5821 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %5822 = "torch.prim.ListConstruct"(%5819, %1485, %5820, %5821) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5823 = "torch.aten.view"(%5818, %5822) : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%5823, %1478) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %5824 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %5825 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %5826 = "torch.aten.add.Scalar"(%5793, %5824, %5825) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%5826, %1478) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %5827 = "torch.prim.ListConstruct"(%1924) : (!torch.int) -> !torch.list<int>
    %5828 = "torch.aten.view"(%5826, %5827) : (!torch.vtensor<[4,?],si64>, !torch.list<int>) -> !torch.vtensor<[?],si64>
    "torch.bind_symbolic_shape"(%5828, %1478) <{shape_expressions = #map24}> : (!torch.vtensor<[?],si64>, !torch.int) -> ()
    %5829 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %5830 = "torch.aten.index_select"(%5810, %5829, %5828) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int, !torch.vtensor<[?],si64>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%5830, %1478) <{shape_expressions = #map23}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %5831 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %5832 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %5833 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %5834 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %5835 = "torch.prim.ListConstruct"(%5831, %1481, %5832, %5833, %5834) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5836 = "torch.aten.view"(%5830, %5835) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%5836, %1478) <{shape_expressions = #map22}> : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %5837 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %5838 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %5839 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %5840 = "torch.prim.ListConstruct"(%5837, %1485, %5838, %5839) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5841 = "torch.aten.view"(%5836, %5840) : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%5841, %1478) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %5842 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %5843 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %5844 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %5845 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %5846 = "torch.aten.slice.Tensor"(%5823, %5842, %5843, %5844, %5845) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%5846, %1478) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %5847 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %5848 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %5849 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %5850 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %5851 = "torch.aten.slice.Tensor"(%5841, %5847, %5848, %5849, %5850) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%5851, %1478) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %5852 = "torch.constant.int"() <{value = -2 : i64}> : () -> !torch.int
    %5853 = "torch.aten.unsqueeze"(%5846, %5852) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%5853, %1478) <{shape_expressions = #map25}> : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.int) -> ()
    %5854 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %5855 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %5856 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %5857 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %5858 = "torch.prim.ListConstruct"(%5854, %1485, %5855, %5856, %5857) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5859 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %5860 = "torch.aten.expand"(%5853, %5858, %5859) : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%5860, %1478) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %5861 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %5862 = "torch.aten.clone"(%5860, %5861) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%5862, %1478) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %5863 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %5864 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %5865 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %5866 = "torch.prim.ListConstruct"(%5863, %1485, %5864, %5865) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5867 = "torch.aten._unsafe_view"(%5862, %5866) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%5867, %1478) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %5868 = "torch.constant.int"() <{value = -2 : i64}> : () -> !torch.int
    %5869 = "torch.aten.unsqueeze"(%5851, %5868) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%5869, %1478) <{shape_expressions = #map25}> : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.int) -> ()
    %5870 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %5871 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %5872 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %5873 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %5874 = "torch.prim.ListConstruct"(%5870, %1485, %5871, %5872, %5873) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5875 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %5876 = "torch.aten.expand"(%5869, %5874, %5875) : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%5876, %1478) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %5877 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %5878 = "torch.aten.clone"(%5876, %5877) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%5878, %1478) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %5879 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %5880 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %5881 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %5882 = "torch.prim.ListConstruct"(%5879, %1485, %5880, %5881) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5883 = "torch.aten._unsafe_view"(%5878, %5882) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%5883, %1478) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %5884 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %5885 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %5886 = "torch.aten.transpose.int"(%5668, %5884, %5885) : (!torch.vtensor<[4,1,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %5887 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %5888 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %5889 = "torch.aten.transpose.int"(%5867, %5887, %5888) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%5889, %1478) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %5890 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %5891 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %5892 = "torch.aten.transpose.int"(%5883, %5890, %5891) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%5892, %1478) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %5893 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %5894 = "torch.aten.squeeze.dim"(%1516, %5893) : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,1,1,?],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%5894, %1478) <{shape_expressions = #map7}> : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>, !torch.int) -> ()
    %5895 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %5896 = "torch.aten.squeeze.dim"(%5894, %5895) : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,1,1,?],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%5896, %1478) <{shape_expressions = #map7}> : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>, !torch.int) -> ()
    %5897 = "torch_c.to_builtin_tensor"(%5886) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>) -> tensor<4x32x1x128xf8E4M3FNUZ>
    %5898 = "tensor.cast"(%5897) : (tensor<4x32x1x128xf8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %5899 = "torch_c.to_builtin_tensor"(%5889) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %5900 = "torch_c.to_builtin_tensor"(%5892) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %5901 = "torch_c.to_builtin_tensor"(%5896) : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>) -> tensor<4x1x1x?xf8E4M3FNUZ>
    %5902 = "tensor.cast"(%5901) : (tensor<4x1x1x?xf8E4M3FNUZ>) -> tensor<?x?x?x?xf8E4M3FNUZ>
    %5903 = "torch_c.to_builtin_tensor"(%488) : (!torch.vtensor<[],f32>) -> tensor<f32>
    %5904 = "util.call"(%5898, %5899, %5900, %5903, %5902) <{callee = @sharktank_masked_flash_attention_4_32_128_128_f8E4M3FNUZ_f32_f32}> : (tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<f32>, tensor<?x?x?x?xf8E4M3FNUZ>) -> tensor<4x32x?x128xf32>
    %5905 = "tensor.cast"(%5904) : (tensor<4x32x?x128xf32>) -> tensor<4x32x1x128xf32>
    %5906 = "torch_c.from_builtin_tensor"(%5905) : (tensor<4x32x1x128xf32>) -> !torch.vtensor<[4,32,1,128],f32>
    %5907 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %5908 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %5909 = "torch.aten.transpose.int"(%5906, %5907, %5908) : (!torch.vtensor<[4,32,1,128],f32>, !torch.int, !torch.int) -> !torch.vtensor<[4,1,32,128],f32>
    %5910 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %5911 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %5912 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %5913 = "torch.prim.ListConstruct"(%5910, %5911, %5912) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5914 = "torch.aten.view"(%5909, %5913) : (!torch.vtensor<[4,1,32,128],f32>, !torch.list<int>) -> !torch.vtensor<[4,1,4096],f32>
    %5915 = "torch.aten.div.Tensor"(%5914, %490) : (!torch.vtensor<[4,1,4096],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],f32>
    %5916 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %5917 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %5918 = "torch.aten.clamp"(%5915, %5916, %5917) : (!torch.vtensor<[4,1,4096],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],f32>
    %5919 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %5920 = "torch.prims.convert_element_type"(%5918, %5919) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %5921 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %5922 = "torch.aten.unsqueeze"(%492, %5921) : (!torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>
    %5923 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %5924 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %5925 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %5926 = "torch.prim.ListConstruct"(%5923, %5924, %5925) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5927 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %5928 = "torch.aten.expand"(%5922, %5926, %5927) : (!torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ>
    %5929 = "torch_c.to_builtin_tensor"(%5920) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %5930 = "torch_c.to_builtin_tensor"(%5928) : (!torch.vtensor<[4,4096,4096],f8E4M3FNUZ>) -> tensor<4x4096x4096xf8E4M3FNUZ>
    %5931 = "util.call"(%5929, %5930) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x1x4096xf32>
    %5932 = "torch_c.from_builtin_tensor"(%5931) : (tensor<4x1x4096xf32>) -> !torch.vtensor<[4,1,4096],f32>
    %5933 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %5934 = "torch.prims.convert_element_type"(%5932, %5933) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %5935 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %5936 = "torch.aten.add.Tensor"(%5535, %5934, %5935) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %5937 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %5938 = "torch.prims.convert_element_type"(%5936, %5937) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f32>
    %5939 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %5940 = "torch.aten.pow.Tensor_Scalar"(%5938, %5939) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],f32>
    %5941 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %5942 = "torch.prim.ListConstruct"(%5941) : (!torch.int) -> !torch.list<int>
    %5943 = "torch.constant.bool"() <{value = true}> : () -> !torch.bool
    %5944 = "torch.constant.none"() : () -> !torch.none
    %5945 = "torch.aten.mean.dim"(%5940, %5942, %5943, %5944) : (!torch.vtensor<[4,1,4096],f32>, !torch.list<int>, !torch.bool, !torch.none) -> !torch.vtensor<[4,1,1],f32>
    %5946 = "torch.constant.float"() <{value = 1.000000e-05 : f64}> : () -> !torch.float
    %5947 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %5948 = "torch.aten.add.Scalar"(%5945, %5946, %5947) : (!torch.vtensor<[4,1,1],f32>, !torch.float, !torch.int) -> !torch.vtensor<[4,1,1],f32>
    %5949 = "torch.aten.rsqrt"(%5948) : (!torch.vtensor<[4,1,1],f32>) -> !torch.vtensor<[4,1,1],f32>
    %5950 = "torch.aten.mul.Tensor"(%5938, %5949) : (!torch.vtensor<[4,1,4096],f32>, !torch.vtensor<[4,1,1],f32>) -> !torch.vtensor<[4,1,4096],f32>
    %5951 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %5952 = "torch.prims.convert_element_type"(%5950, %5951) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %5953 = "torch.aten.mul.Tensor"(%494, %5952) : (!torch.vtensor<[4096],bf16>, !torch.vtensor<[4,1,4096],bf16>) -> !torch.vtensor<[4,1,4096],bf16>
    %5954 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %5955 = "torch.prims.convert_element_type"(%5953, %5954) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %5956 = "torch.aten.div.Tensor"(%5955, %496) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %5957 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %5958 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %5959 = "torch.aten.clamp"(%5956, %5957, %5958) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %5960 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %5961 = "torch.prims.convert_element_type"(%5959, %5960) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %5962 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %5963 = "torch.aten.unsqueeze"(%498, %5962) : (!torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>
    %5964 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %5965 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %5966 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %5967 = "torch.prim.ListConstruct"(%5964, %5965, %5966) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5968 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %5969 = "torch.aten.expand"(%5963, %5967, %5968) : (!torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ>
    %5970 = "torch_c.to_builtin_tensor"(%5961) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %5971 = "torch_c.to_builtin_tensor"(%5969) : (!torch.vtensor<[4,14336,4096],f8E4M3FNUZ>) -> tensor<4x14336x4096xf8E4M3FNUZ>
    %5972 = "util.call"(%5970, %5971) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x1x14336xf32>
    %5973 = "torch_c.from_builtin_tensor"(%5972) : (tensor<4x1x14336xf32>) -> !torch.vtensor<[4,1,14336],f32>
    %5974 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %5975 = "torch.prims.convert_element_type"(%5973, %5974) : (!torch.vtensor<[4,1,14336],f32>, !torch.int) -> !torch.vtensor<[4,1,14336],bf16>
    %5976 = "torch.aten.silu"(%5975) : (!torch.vtensor<[4,1,14336],bf16>) -> !torch.vtensor<[4,1,14336],bf16>
    %5977 = "torch.aten.div.Tensor"(%5955, %500) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %5978 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %5979 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %5980 = "torch.aten.clamp"(%5977, %5978, %5979) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %5981 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %5982 = "torch.prims.convert_element_type"(%5980, %5981) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %5983 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %5984 = "torch.aten.unsqueeze"(%502, %5983) : (!torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>
    %5985 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %5986 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %5987 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %5988 = "torch.prim.ListConstruct"(%5985, %5986, %5987) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5989 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %5990 = "torch.aten.expand"(%5984, %5988, %5989) : (!torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ>
    %5991 = "torch_c.to_builtin_tensor"(%5982) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %5992 = "torch_c.to_builtin_tensor"(%5990) : (!torch.vtensor<[4,14336,4096],f8E4M3FNUZ>) -> tensor<4x14336x4096xf8E4M3FNUZ>
    %5993 = "util.call"(%5991, %5992) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x1x14336xf32>
    %5994 = "torch_c.from_builtin_tensor"(%5993) : (tensor<4x1x14336xf32>) -> !torch.vtensor<[4,1,14336],f32>
    %5995 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %5996 = "torch.prims.convert_element_type"(%5994, %5995) : (!torch.vtensor<[4,1,14336],f32>, !torch.int) -> !torch.vtensor<[4,1,14336],bf16>
    %5997 = "torch.aten.mul.Tensor"(%5976, %5996) : (!torch.vtensor<[4,1,14336],bf16>, !torch.vtensor<[4,1,14336],bf16>) -> !torch.vtensor<[4,1,14336],bf16>
    %5998 = "torch.aten.div.Tensor"(%5997, %504) : (!torch.vtensor<[4,1,14336],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,14336],bf16>
    %5999 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %6000 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %6001 = "torch.aten.clamp"(%5998, %5999, %6000) : (!torch.vtensor<[4,1,14336],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,14336],bf16>
    %6002 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %6003 = "torch.prims.convert_element_type"(%6001, %6002) : (!torch.vtensor<[4,1,14336],bf16>, !torch.int) -> !torch.vtensor<[4,1,14336],f8E4M3FNUZ>
    %6004 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %6005 = "torch.aten.unsqueeze"(%506, %6004) : (!torch.vtensor<[4096,14336],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,14336],f8E4M3FNUZ>
    %6006 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %6007 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %6008 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %6009 = "torch.prim.ListConstruct"(%6006, %6007, %6008) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6010 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %6011 = "torch.aten.expand"(%6005, %6009, %6010) : (!torch.vtensor<[1,4096,14336],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,14336],f8E4M3FNUZ>
    %6012 = "torch_c.to_builtin_tensor"(%6003) : (!torch.vtensor<[4,1,14336],f8E4M3FNUZ>) -> tensor<4x1x14336xf8E4M3FNUZ>
    %6013 = "torch_c.to_builtin_tensor"(%6011) : (!torch.vtensor<[4,4096,14336],f8E4M3FNUZ>) -> tensor<4x4096x14336xf8E4M3FNUZ>
    %6014 = "util.call"(%6012, %6013) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x14336xf8E4M3FNUZ_R4x4096x14336xf8E4M3FNUZ}> : (tensor<4x1x14336xf8E4M3FNUZ>, tensor<4x4096x14336xf8E4M3FNUZ>) -> tensor<4x1x4096xf32>
    %6015 = "torch_c.from_builtin_tensor"(%6014) : (tensor<4x1x4096xf32>) -> !torch.vtensor<[4,1,4096],f32>
    %6016 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %6017 = "torch.prims.convert_element_type"(%6015, %6016) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %6018 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %6019 = "torch.aten.add.Tensor"(%5936, %6017, %6018) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %6020 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %6021 = "torch.prims.convert_element_type"(%6019, %6020) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f32>
    %6022 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %6023 = "torch.aten.pow.Tensor_Scalar"(%6021, %6022) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],f32>
    %6024 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %6025 = "torch.prim.ListConstruct"(%6024) : (!torch.int) -> !torch.list<int>
    %6026 = "torch.constant.bool"() <{value = true}> : () -> !torch.bool
    %6027 = "torch.constant.none"() : () -> !torch.none
    %6028 = "torch.aten.mean.dim"(%6023, %6025, %6026, %6027) : (!torch.vtensor<[4,1,4096],f32>, !torch.list<int>, !torch.bool, !torch.none) -> !torch.vtensor<[4,1,1],f32>
    %6029 = "torch.constant.float"() <{value = 1.000000e-05 : f64}> : () -> !torch.float
    %6030 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %6031 = "torch.aten.add.Scalar"(%6028, %6029, %6030) : (!torch.vtensor<[4,1,1],f32>, !torch.float, !torch.int) -> !torch.vtensor<[4,1,1],f32>
    %6032 = "torch.aten.rsqrt"(%6031) : (!torch.vtensor<[4,1,1],f32>) -> !torch.vtensor<[4,1,1],f32>
    %6033 = "torch.aten.mul.Tensor"(%6021, %6032) : (!torch.vtensor<[4,1,4096],f32>, !torch.vtensor<[4,1,1],f32>) -> !torch.vtensor<[4,1,4096],f32>
    %6034 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %6035 = "torch.prims.convert_element_type"(%6033, %6034) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %6036 = "torch.aten.mul.Tensor"(%508, %6035) : (!torch.vtensor<[4096],bf16>, !torch.vtensor<[4,1,4096],bf16>) -> !torch.vtensor<[4,1,4096],bf16>
    %6037 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %6038 = "torch.prims.convert_element_type"(%6036, %6037) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %6039 = "torch.aten.div.Tensor"(%6038, %510) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %6040 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %6041 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %6042 = "torch.aten.clamp"(%6039, %6040, %6041) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %6043 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %6044 = "torch.prims.convert_element_type"(%6042, %6043) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %6045 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %6046 = "torch.aten.unsqueeze"(%512, %6045) : (!torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>
    %6047 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %6048 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %6049 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %6050 = "torch.prim.ListConstruct"(%6047, %6048, %6049) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6051 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %6052 = "torch.aten.expand"(%6046, %6050, %6051) : (!torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ>
    %6053 = "torch_c.to_builtin_tensor"(%6044) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %6054 = "torch_c.to_builtin_tensor"(%6052) : (!torch.vtensor<[4,4096,4096],f8E4M3FNUZ>) -> tensor<4x4096x4096xf8E4M3FNUZ>
    %6055 = "util.call"(%6053, %6054) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x1x4096xf32>
    %6056 = "torch_c.from_builtin_tensor"(%6055) : (tensor<4x1x4096xf32>) -> !torch.vtensor<[4,1,4096],f32>
    %6057 = "torch.aten.div.Tensor"(%6056, %514) : (!torch.vtensor<[4,1,4096],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],f32>
    %6058 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %6059 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %6060 = "torch.aten.clamp"(%6057, %6058, %6059) : (!torch.vtensor<[4,1,4096],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],f32>
    %6061 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %6062 = "torch.prims.convert_element_type"(%6060, %6061) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %6063 = "torch.aten.div.Tensor"(%6038, %516) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %6064 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %6065 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %6066 = "torch.aten.clamp"(%6063, %6064, %6065) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %6067 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %6068 = "torch.prims.convert_element_type"(%6066, %6067) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %6069 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %6070 = "torch.aten.unsqueeze"(%518, %6069) : (!torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>
    %6071 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %6072 = "torch.constant.int"() <{value = 1024 : i64}> : () -> !torch.int
    %6073 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %6074 = "torch.prim.ListConstruct"(%6071, %6072, %6073) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6075 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %6076 = "torch.aten.expand"(%6070, %6074, %6075) : (!torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ>
    %6077 = "torch_c.to_builtin_tensor"(%6068) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %6078 = "torch_c.to_builtin_tensor"(%6076) : (!torch.vtensor<[4,1024,4096],f8E4M3FNUZ>) -> tensor<4x1024x4096xf8E4M3FNUZ>
    %6079 = "util.call"(%6077, %6078) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x1x1024xf32>
    %6080 = "torch_c.from_builtin_tensor"(%6079) : (tensor<4x1x1024xf32>) -> !torch.vtensor<[4,1,1024],f32>
    %6081 = "torch.aten.div.Tensor"(%6080, %520) : (!torch.vtensor<[4,1,1024],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,1024],f32>
    %6082 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %6083 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %6084 = "torch.aten.clamp"(%6081, %6082, %6083) : (!torch.vtensor<[4,1,1024],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,1024],f32>
    %6085 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %6086 = "torch.prims.convert_element_type"(%6084, %6085) : (!torch.vtensor<[4,1,1024],f32>, !torch.int) -> !torch.vtensor<[4,1,1024],f8E4M3FNUZ>
    %6087 = "torch.aten.div.Tensor"(%6038, %522) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %6088 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %6089 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %6090 = "torch.aten.clamp"(%6087, %6088, %6089) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %6091 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %6092 = "torch.prims.convert_element_type"(%6090, %6091) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %6093 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %6094 = "torch.aten.unsqueeze"(%524, %6093) : (!torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>
    %6095 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %6096 = "torch.constant.int"() <{value = 1024 : i64}> : () -> !torch.int
    %6097 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %6098 = "torch.prim.ListConstruct"(%6095, %6096, %6097) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6099 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %6100 = "torch.aten.expand"(%6094, %6098, %6099) : (!torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ>
    %6101 = "torch_c.to_builtin_tensor"(%6092) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %6102 = "torch_c.to_builtin_tensor"(%6100) : (!torch.vtensor<[4,1024,4096],f8E4M3FNUZ>) -> tensor<4x1024x4096xf8E4M3FNUZ>
    %6103 = "util.call"(%6101, %6102) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x1x1024xf32>
    %6104 = "torch_c.from_builtin_tensor"(%6103) : (tensor<4x1x1024xf32>) -> !torch.vtensor<[4,1,1024],f32>
    %6105 = "torch.aten.div.Tensor"(%6104, %526) : (!torch.vtensor<[4,1,1024],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,1024],f32>
    %6106 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %6107 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %6108 = "torch.aten.clamp"(%6105, %6106, %6107) : (!torch.vtensor<[4,1,1024],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,1024],f32>
    %6109 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %6110 = "torch.prims.convert_element_type"(%6108, %6109) : (!torch.vtensor<[4,1,1024],f32>, !torch.int) -> !torch.vtensor<[4,1,1024],f8E4M3FNUZ>
    %6111 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %6112 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %6113 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %6114 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %6115 = "torch.prim.ListConstruct"(%6111, %6112, %6113, %6114) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6116 = "torch.aten.view"(%6062, %6115) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,1,32,128],f8E4M3FNUZ>
    %6117 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %6118 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %6119 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %6120 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %6121 = "torch.prim.ListConstruct"(%6117, %6118, %6119, %6120) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6122 = "torch.aten.view"(%6086, %6121) : (!torch.vtensor<[4,1,1024],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>
    %6123 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %6124 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %6125 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %6126 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %6127 = "torch.prim.ListConstruct"(%6123, %6124, %6125, %6126) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6128 = "torch.aten.view"(%6110, %6127) : (!torch.vtensor<[4,1,1024],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>
    %6129 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %6130 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %6131 = "torch.aten.transpose.int"(%6116, %6129, %6130) : (!torch.vtensor<[4,1,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %6132 = "torch.aten.mul.Tensor"(%6131, %1637) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,1,1,128],bf16>) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %6133 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %6134 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %6135 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %6136 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %6137 = "torch.aten.slice.Tensor"(%6131, %6133, %6134, %6135, %6136) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,32,1,64],f8E4M3FNUZ>
    %6138 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %6139 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %6140 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %6141 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %6142 = "torch.aten.slice.Tensor"(%6131, %6138, %6139, %6140, %6141) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,32,1,64],f8E4M3FNUZ>
    %6143 = "torch.aten.neg"(%6142) : (!torch.vtensor<[4,32,1,64],f8E4M3FNUZ>) -> !torch.vtensor<[4,32,1,64],f8E4M3FNUZ>
    %6144 = "torch.prim.ListConstruct"(%6143, %6137) : (!torch.vtensor<[4,32,1,64],f8E4M3FNUZ>, !torch.vtensor<[4,32,1,64],f8E4M3FNUZ>) -> !torch.list<vtensor>
    %6145 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %6146 = "torch.aten.cat"(%6144, %6145) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %6147 = "torch.aten.mul.Tensor"(%6146, %1651) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,1,1,128],bf16>) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %6148 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %6149 = "torch.aten.add.Tensor"(%6132, %6147, %6148) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %6150 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %6151 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %6152 = "torch.aten.transpose.int"(%6149, %6150, %6151) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,1,32,128],f8E4M3FNUZ>
    %6153 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %6154 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %6155 = "torch.aten.transpose.int"(%6122, %6153, %6154) : (!torch.vtensor<[4,1,8,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %6156 = "torch.aten.mul.Tensor"(%6155, %1637) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,1,1,128],bf16>) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %6157 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %6158 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %6159 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %6160 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %6161 = "torch.aten.slice.Tensor"(%6155, %6157, %6158, %6159, %6160) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,8,1,64],f8E4M3FNUZ>
    %6162 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %6163 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %6164 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %6165 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %6166 = "torch.aten.slice.Tensor"(%6155, %6162, %6163, %6164, %6165) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,8,1,64],f8E4M3FNUZ>
    %6167 = "torch.aten.neg"(%6166) : (!torch.vtensor<[4,8,1,64],f8E4M3FNUZ>) -> !torch.vtensor<[4,8,1,64],f8E4M3FNUZ>
    %6168 = "torch.prim.ListConstruct"(%6167, %6161) : (!torch.vtensor<[4,8,1,64],f8E4M3FNUZ>, !torch.vtensor<[4,8,1,64],f8E4M3FNUZ>) -> !torch.list<vtensor>
    %6169 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %6170 = "torch.aten.cat"(%6168, %6169) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %6171 = "torch.aten.mul.Tensor"(%6170, %1651) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,1,1,128],bf16>) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %6172 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %6173 = "torch.aten.add.Tensor"(%6156, %6171, %6172) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %6174 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %6175 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %6176 = "torch.aten.transpose.int"(%6173, %6174, %6175) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>
    %6177 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %6178 = "torch.aten.floor_divide.Scalar"(%arg64, %6177) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4],si64>
    %6179 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %6180 = "torch.aten.unsqueeze"(%6178, %6179) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %6181 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %6182 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %6183 = "torch.aten.gather"(%arg65, %6181, %6180, %6182) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.vtensor<[4,1],si64>, !torch.bool) -> !torch.vtensor<[4,1],si64>
    %6184 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %6185 = "torch.aten.remainder.Scalar"(%arg64, %6184) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4],si64>
    %6186 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %6187 = "torch.aten.unsqueeze"(%6185, %6186) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %6188 = "torch.constant.none"() : () -> !torch.none
    %6189 = "torch.aten.clone"(%527, %6188) : (!torch.vtensor<[],si64>, !torch.none) -> !torch.vtensor<[],si64>
    %6190 = "torch.aten.detach"(%6189) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %6191 = "torch.aten.detach"(%6190) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %6192 = "torch.aten.detach"(%6191) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %6193 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %6194 = "torch.aten.unsqueeze"(%6192, %6193) : (!torch.vtensor<[],si64>, !torch.int) -> !torch.vtensor<[1],si64>
    %6195 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %6196 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %6197 = "torch.prim.ListConstruct"(%6195, %6196) : (!torch.int, !torch.int) -> !torch.list<int>
    %6198 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %6199 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %6200 = "torch.prim.ListConstruct"(%6198, %6199) : (!torch.int, !torch.int) -> !torch.list<int>
    %6201 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %6202 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %6203 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %6204 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %6205 = "torch.aten.empty_strided"(%6197, %6200, %6201, %6202, %6203, %6204) : (!torch.list<int>, !torch.list<int>, !torch.int, !torch.int, !torch.Device, !torch.bool) -> !torch.vtensor<[4,1],si64>
    %6206 = "torch.constant.int"() <{value = 9 : i64}> : () -> !torch.int
    %6207 = "torch.aten.fill.Scalar"(%6205, %6206) : (!torch.vtensor<[4,1],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %6208 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %6209 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %6210 = "torch.prim.ListConstruct"(%6208, %6209) : (!torch.int, !torch.int) -> !torch.list<int>
    %6211 = "torch.aten.repeat"(%6194, %6210) : (!torch.vtensor<[1],si64>, !torch.list<int>) -> !torch.vtensor<[4,1],si64>
    %6212 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %6213 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %6214 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %6215 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %6216 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %6217 = "torch.prim.ListConstruct"(%1483, %6212, %6213, %6214, %6215, %6216) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6218 = "torch.aten.view"(%5788, %6217) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%6218, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %6219 = "torch.prim.ListConstruct"(%6183, %6207, %6211, %6187) : (!torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>) -> !torch.list<optional<vtensor>>
    %6220 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %6221 = "torch.aten.index_put"(%6218, %6219, %6176, %6220) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>, !torch.bool) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%6221, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %6222 = "torch.constant.int"() <{value = 2097152 : i64}> : () -> !torch.int
    %6223 = "torch.prim.ListConstruct"(%1483, %6222) : (!torch.int, !torch.int) -> !torch.list<int>
    %6224 = "torch.aten.view"(%6221, %6223) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,2097152],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%6224, %1479) <{shape_expressions = #map2}> : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> ()
    %6225 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %6226 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %6227 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %6228 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %6229 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %6230 = "torch.prim.ListConstruct"(%1483, %6225, %6226, %6227, %6228, %6229) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6231 = "torch.aten.view"(%6224, %6230) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%6231, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %6232 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %6233 = "torch.aten.floor_divide.Scalar"(%arg64, %6232) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4],si64>
    %6234 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %6235 = "torch.aten.unsqueeze"(%6233, %6234) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %6236 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %6237 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %6238 = "torch.aten.gather"(%arg65, %6236, %6235, %6237) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.vtensor<[4,1],si64>, !torch.bool) -> !torch.vtensor<[4,1],si64>
    %6239 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %6240 = "torch.aten.remainder.Scalar"(%arg64, %6239) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4],si64>
    %6241 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %6242 = "torch.aten.unsqueeze"(%6240, %6241) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %6243 = "torch.constant.none"() : () -> !torch.none
    %6244 = "torch.aten.clone"(%528, %6243) : (!torch.vtensor<[],si64>, !torch.none) -> !torch.vtensor<[],si64>
    %6245 = "torch.aten.detach"(%6244) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %6246 = "torch.aten.detach"(%6245) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %6247 = "torch.aten.detach"(%6246) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %6248 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %6249 = "torch.aten.unsqueeze"(%6247, %6248) : (!torch.vtensor<[],si64>, !torch.int) -> !torch.vtensor<[1],si64>
    %6250 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %6251 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %6252 = "torch.prim.ListConstruct"(%6250, %6251) : (!torch.int, !torch.int) -> !torch.list<int>
    %6253 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %6254 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %6255 = "torch.prim.ListConstruct"(%6253, %6254) : (!torch.int, !torch.int) -> !torch.list<int>
    %6256 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %6257 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %6258 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %6259 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %6260 = "torch.aten.empty_strided"(%6252, %6255, %6256, %6257, %6258, %6259) : (!torch.list<int>, !torch.list<int>, !torch.int, !torch.int, !torch.Device, !torch.bool) -> !torch.vtensor<[4,1],si64>
    %6261 = "torch.constant.int"() <{value = 9 : i64}> : () -> !torch.int
    %6262 = "torch.aten.fill.Scalar"(%6260, %6261) : (!torch.vtensor<[4,1],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %6263 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %6264 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %6265 = "torch.prim.ListConstruct"(%6263, %6264) : (!torch.int, !torch.int) -> !torch.list<int>
    %6266 = "torch.aten.repeat"(%6249, %6265) : (!torch.vtensor<[1],si64>, !torch.list<int>) -> !torch.vtensor<[4,1],si64>
    %6267 = "torch.prim.ListConstruct"(%6238, %6262, %6266, %6242) : (!torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>) -> !torch.list<optional<vtensor>>
    %6268 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %6269 = "torch.aten.index_put"(%6231, %6267, %6128, %6268) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>, !torch.bool) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%6269, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %6270 = "torch.constant.int"() <{value = 2097152 : i64}> : () -> !torch.int
    %6271 = "torch.prim.ListConstruct"(%1483, %6270) : (!torch.int, !torch.int) -> !torch.list<int>
    %6272 = "torch.aten.view"(%6269, %6271) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,2097152],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%6272, %1479) <{shape_expressions = #map2}> : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> ()
    %6273 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %6274 = "torch.aten.mul.Scalar"(%arg65, %6273) : (!torch.vtensor<[4,?],si64>, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%6274, %1478) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %6275 = "torch.constant.int"() <{value = 18 : i64}> : () -> !torch.int
    %6276 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %6277 = "torch.aten.add.Scalar"(%6274, %6275, %6276) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%6277, %1478) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %6278 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %6279 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %6280 = "torch.aten.add.Scalar"(%6277, %6278, %6279) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%6280, %1478) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %6281 = "torch.prim.ListConstruct"(%1924) : (!torch.int) -> !torch.list<int>
    %6282 = "torch.aten.view"(%6280, %6281) : (!torch.vtensor<[4,?],si64>, !torch.list<int>) -> !torch.vtensor<[?],si64>
    "torch.bind_symbolic_shape"(%6282, %1478) <{shape_expressions = #map24}> : (!torch.vtensor<[?],si64>, !torch.int) -> ()
    %6283 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %6284 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %6285 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %6286 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %6287 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %6288 = "torch.prim.ListConstruct"(%1483, %6283, %6284, %6285, %6286, %6287) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6289 = "torch.aten.view"(%6272, %6288) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%6289, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %6290 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %6291 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %6292 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %6293 = "torch.prim.ListConstruct"(%1914, %6290, %6291, %6292) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6294 = "torch.aten.view"(%6289, %6293) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%6294, %1479) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %6295 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %6296 = "torch.aten.index_select"(%6294, %6295, %6282) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int, !torch.vtensor<[?],si64>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%6296, %1478) <{shape_expressions = #map23}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %6297 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %6298 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %6299 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %6300 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %6301 = "torch.prim.ListConstruct"(%6297, %1481, %6298, %6299, %6300) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6302 = "torch.aten.view"(%6296, %6301) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%6302, %1478) <{shape_expressions = #map22}> : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %6303 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %6304 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %6305 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %6306 = "torch.prim.ListConstruct"(%6303, %1485, %6304, %6305) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6307 = "torch.aten.view"(%6302, %6306) : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%6307, %1478) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %6308 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %6309 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %6310 = "torch.aten.add.Scalar"(%6277, %6308, %6309) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%6310, %1478) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %6311 = "torch.prim.ListConstruct"(%1924) : (!torch.int) -> !torch.list<int>
    %6312 = "torch.aten.view"(%6310, %6311) : (!torch.vtensor<[4,?],si64>, !torch.list<int>) -> !torch.vtensor<[?],si64>
    "torch.bind_symbolic_shape"(%6312, %1478) <{shape_expressions = #map24}> : (!torch.vtensor<[?],si64>, !torch.int) -> ()
    %6313 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %6314 = "torch.aten.index_select"(%6294, %6313, %6312) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int, !torch.vtensor<[?],si64>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%6314, %1478) <{shape_expressions = #map23}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %6315 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %6316 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %6317 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %6318 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %6319 = "torch.prim.ListConstruct"(%6315, %1481, %6316, %6317, %6318) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6320 = "torch.aten.view"(%6314, %6319) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%6320, %1478) <{shape_expressions = #map22}> : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %6321 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %6322 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %6323 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %6324 = "torch.prim.ListConstruct"(%6321, %1485, %6322, %6323) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6325 = "torch.aten.view"(%6320, %6324) : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%6325, %1478) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %6326 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %6327 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %6328 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %6329 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %6330 = "torch.aten.slice.Tensor"(%6307, %6326, %6327, %6328, %6329) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%6330, %1478) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %6331 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %6332 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %6333 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %6334 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %6335 = "torch.aten.slice.Tensor"(%6325, %6331, %6332, %6333, %6334) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%6335, %1478) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %6336 = "torch.constant.int"() <{value = -2 : i64}> : () -> !torch.int
    %6337 = "torch.aten.unsqueeze"(%6330, %6336) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%6337, %1478) <{shape_expressions = #map25}> : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.int) -> ()
    %6338 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %6339 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %6340 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %6341 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %6342 = "torch.prim.ListConstruct"(%6338, %1485, %6339, %6340, %6341) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6343 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %6344 = "torch.aten.expand"(%6337, %6342, %6343) : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%6344, %1478) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %6345 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %6346 = "torch.aten.clone"(%6344, %6345) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%6346, %1478) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %6347 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %6348 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %6349 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %6350 = "torch.prim.ListConstruct"(%6347, %1485, %6348, %6349) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6351 = "torch.aten._unsafe_view"(%6346, %6350) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%6351, %1478) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %6352 = "torch.constant.int"() <{value = -2 : i64}> : () -> !torch.int
    %6353 = "torch.aten.unsqueeze"(%6335, %6352) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%6353, %1478) <{shape_expressions = #map25}> : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.int) -> ()
    %6354 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %6355 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %6356 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %6357 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %6358 = "torch.prim.ListConstruct"(%6354, %1485, %6355, %6356, %6357) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6359 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %6360 = "torch.aten.expand"(%6353, %6358, %6359) : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%6360, %1478) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %6361 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %6362 = "torch.aten.clone"(%6360, %6361) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%6362, %1478) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %6363 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %6364 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %6365 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %6366 = "torch.prim.ListConstruct"(%6363, %1485, %6364, %6365) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6367 = "torch.aten._unsafe_view"(%6362, %6366) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%6367, %1478) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %6368 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %6369 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %6370 = "torch.aten.transpose.int"(%6152, %6368, %6369) : (!torch.vtensor<[4,1,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %6371 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %6372 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %6373 = "torch.aten.transpose.int"(%6351, %6371, %6372) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%6373, %1478) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %6374 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %6375 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %6376 = "torch.aten.transpose.int"(%6367, %6374, %6375) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%6376, %1478) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %6377 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %6378 = "torch.aten.squeeze.dim"(%1516, %6377) : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,1,1,?],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%6378, %1478) <{shape_expressions = #map7}> : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>, !torch.int) -> ()
    %6379 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %6380 = "torch.aten.squeeze.dim"(%6378, %6379) : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,1,1,?],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%6380, %1478) <{shape_expressions = #map7}> : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>, !torch.int) -> ()
    %6381 = "torch_c.to_builtin_tensor"(%6370) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>) -> tensor<4x32x1x128xf8E4M3FNUZ>
    %6382 = "tensor.cast"(%6381) : (tensor<4x32x1x128xf8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %6383 = "torch_c.to_builtin_tensor"(%6373) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %6384 = "torch_c.to_builtin_tensor"(%6376) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %6385 = "torch_c.to_builtin_tensor"(%6380) : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>) -> tensor<4x1x1x?xf8E4M3FNUZ>
    %6386 = "tensor.cast"(%6385) : (tensor<4x1x1x?xf8E4M3FNUZ>) -> tensor<?x?x?x?xf8E4M3FNUZ>
    %6387 = "torch_c.to_builtin_tensor"(%530) : (!torch.vtensor<[],f32>) -> tensor<f32>
    %6388 = "util.call"(%6382, %6383, %6384, %6387, %6386) <{callee = @sharktank_masked_flash_attention_4_32_128_128_f8E4M3FNUZ_f32_f32}> : (tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<f32>, tensor<?x?x?x?xf8E4M3FNUZ>) -> tensor<4x32x?x128xf32>
    %6389 = "tensor.cast"(%6388) : (tensor<4x32x?x128xf32>) -> tensor<4x32x1x128xf32>
    %6390 = "torch_c.from_builtin_tensor"(%6389) : (tensor<4x32x1x128xf32>) -> !torch.vtensor<[4,32,1,128],f32>
    %6391 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %6392 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %6393 = "torch.aten.transpose.int"(%6390, %6391, %6392) : (!torch.vtensor<[4,32,1,128],f32>, !torch.int, !torch.int) -> !torch.vtensor<[4,1,32,128],f32>
    %6394 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %6395 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %6396 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %6397 = "torch.prim.ListConstruct"(%6394, %6395, %6396) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6398 = "torch.aten.view"(%6393, %6397) : (!torch.vtensor<[4,1,32,128],f32>, !torch.list<int>) -> !torch.vtensor<[4,1,4096],f32>
    %6399 = "torch.aten.div.Tensor"(%6398, %532) : (!torch.vtensor<[4,1,4096],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],f32>
    %6400 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %6401 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %6402 = "torch.aten.clamp"(%6399, %6400, %6401) : (!torch.vtensor<[4,1,4096],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],f32>
    %6403 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %6404 = "torch.prims.convert_element_type"(%6402, %6403) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %6405 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %6406 = "torch.aten.unsqueeze"(%534, %6405) : (!torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>
    %6407 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %6408 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %6409 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %6410 = "torch.prim.ListConstruct"(%6407, %6408, %6409) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6411 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %6412 = "torch.aten.expand"(%6406, %6410, %6411) : (!torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ>
    %6413 = "torch_c.to_builtin_tensor"(%6404) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %6414 = "torch_c.to_builtin_tensor"(%6412) : (!torch.vtensor<[4,4096,4096],f8E4M3FNUZ>) -> tensor<4x4096x4096xf8E4M3FNUZ>
    %6415 = "util.call"(%6413, %6414) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x1x4096xf32>
    %6416 = "torch_c.from_builtin_tensor"(%6415) : (tensor<4x1x4096xf32>) -> !torch.vtensor<[4,1,4096],f32>
    %6417 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %6418 = "torch.prims.convert_element_type"(%6416, %6417) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %6419 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %6420 = "torch.aten.add.Tensor"(%6019, %6418, %6419) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %6421 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %6422 = "torch.prims.convert_element_type"(%6420, %6421) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f32>
    %6423 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %6424 = "torch.aten.pow.Tensor_Scalar"(%6422, %6423) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],f32>
    %6425 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %6426 = "torch.prim.ListConstruct"(%6425) : (!torch.int) -> !torch.list<int>
    %6427 = "torch.constant.bool"() <{value = true}> : () -> !torch.bool
    %6428 = "torch.constant.none"() : () -> !torch.none
    %6429 = "torch.aten.mean.dim"(%6424, %6426, %6427, %6428) : (!torch.vtensor<[4,1,4096],f32>, !torch.list<int>, !torch.bool, !torch.none) -> !torch.vtensor<[4,1,1],f32>
    %6430 = "torch.constant.float"() <{value = 1.000000e-05 : f64}> : () -> !torch.float
    %6431 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %6432 = "torch.aten.add.Scalar"(%6429, %6430, %6431) : (!torch.vtensor<[4,1,1],f32>, !torch.float, !torch.int) -> !torch.vtensor<[4,1,1],f32>
    %6433 = "torch.aten.rsqrt"(%6432) : (!torch.vtensor<[4,1,1],f32>) -> !torch.vtensor<[4,1,1],f32>
    %6434 = "torch.aten.mul.Tensor"(%6422, %6433) : (!torch.vtensor<[4,1,4096],f32>, !torch.vtensor<[4,1,1],f32>) -> !torch.vtensor<[4,1,4096],f32>
    %6435 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %6436 = "torch.prims.convert_element_type"(%6434, %6435) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %6437 = "torch.aten.mul.Tensor"(%536, %6436) : (!torch.vtensor<[4096],bf16>, !torch.vtensor<[4,1,4096],bf16>) -> !torch.vtensor<[4,1,4096],bf16>
    %6438 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %6439 = "torch.prims.convert_element_type"(%6437, %6438) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %6440 = "torch.aten.div.Tensor"(%6439, %538) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %6441 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %6442 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %6443 = "torch.aten.clamp"(%6440, %6441, %6442) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %6444 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %6445 = "torch.prims.convert_element_type"(%6443, %6444) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %6446 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %6447 = "torch.aten.unsqueeze"(%540, %6446) : (!torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>
    %6448 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %6449 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %6450 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %6451 = "torch.prim.ListConstruct"(%6448, %6449, %6450) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6452 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %6453 = "torch.aten.expand"(%6447, %6451, %6452) : (!torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ>
    %6454 = "torch_c.to_builtin_tensor"(%6445) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %6455 = "torch_c.to_builtin_tensor"(%6453) : (!torch.vtensor<[4,14336,4096],f8E4M3FNUZ>) -> tensor<4x14336x4096xf8E4M3FNUZ>
    %6456 = "util.call"(%6454, %6455) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x1x14336xf32>
    %6457 = "torch_c.from_builtin_tensor"(%6456) : (tensor<4x1x14336xf32>) -> !torch.vtensor<[4,1,14336],f32>
    %6458 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %6459 = "torch.prims.convert_element_type"(%6457, %6458) : (!torch.vtensor<[4,1,14336],f32>, !torch.int) -> !torch.vtensor<[4,1,14336],bf16>
    %6460 = "torch.aten.silu"(%6459) : (!torch.vtensor<[4,1,14336],bf16>) -> !torch.vtensor<[4,1,14336],bf16>
    %6461 = "torch.aten.div.Tensor"(%6439, %542) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %6462 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %6463 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %6464 = "torch.aten.clamp"(%6461, %6462, %6463) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %6465 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %6466 = "torch.prims.convert_element_type"(%6464, %6465) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %6467 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %6468 = "torch.aten.unsqueeze"(%544, %6467) : (!torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>
    %6469 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %6470 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %6471 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %6472 = "torch.prim.ListConstruct"(%6469, %6470, %6471) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6473 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %6474 = "torch.aten.expand"(%6468, %6472, %6473) : (!torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ>
    %6475 = "torch_c.to_builtin_tensor"(%6466) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %6476 = "torch_c.to_builtin_tensor"(%6474) : (!torch.vtensor<[4,14336,4096],f8E4M3FNUZ>) -> tensor<4x14336x4096xf8E4M3FNUZ>
    %6477 = "util.call"(%6475, %6476) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x1x14336xf32>
    %6478 = "torch_c.from_builtin_tensor"(%6477) : (tensor<4x1x14336xf32>) -> !torch.vtensor<[4,1,14336],f32>
    %6479 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %6480 = "torch.prims.convert_element_type"(%6478, %6479) : (!torch.vtensor<[4,1,14336],f32>, !torch.int) -> !torch.vtensor<[4,1,14336],bf16>
    %6481 = "torch.aten.mul.Tensor"(%6460, %6480) : (!torch.vtensor<[4,1,14336],bf16>, !torch.vtensor<[4,1,14336],bf16>) -> !torch.vtensor<[4,1,14336],bf16>
    %6482 = "torch.aten.div.Tensor"(%6481, %546) : (!torch.vtensor<[4,1,14336],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,14336],bf16>
    %6483 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %6484 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %6485 = "torch.aten.clamp"(%6482, %6483, %6484) : (!torch.vtensor<[4,1,14336],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,14336],bf16>
    %6486 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %6487 = "torch.prims.convert_element_type"(%6485, %6486) : (!torch.vtensor<[4,1,14336],bf16>, !torch.int) -> !torch.vtensor<[4,1,14336],f8E4M3FNUZ>
    %6488 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %6489 = "torch.aten.unsqueeze"(%548, %6488) : (!torch.vtensor<[4096,14336],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,14336],f8E4M3FNUZ>
    %6490 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %6491 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %6492 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %6493 = "torch.prim.ListConstruct"(%6490, %6491, %6492) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6494 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %6495 = "torch.aten.expand"(%6489, %6493, %6494) : (!torch.vtensor<[1,4096,14336],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,14336],f8E4M3FNUZ>
    %6496 = "torch_c.to_builtin_tensor"(%6487) : (!torch.vtensor<[4,1,14336],f8E4M3FNUZ>) -> tensor<4x1x14336xf8E4M3FNUZ>
    %6497 = "torch_c.to_builtin_tensor"(%6495) : (!torch.vtensor<[4,4096,14336],f8E4M3FNUZ>) -> tensor<4x4096x14336xf8E4M3FNUZ>
    %6498 = "util.call"(%6496, %6497) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x14336xf8E4M3FNUZ_R4x4096x14336xf8E4M3FNUZ}> : (tensor<4x1x14336xf8E4M3FNUZ>, tensor<4x4096x14336xf8E4M3FNUZ>) -> tensor<4x1x4096xf32>
    %6499 = "torch_c.from_builtin_tensor"(%6498) : (tensor<4x1x4096xf32>) -> !torch.vtensor<[4,1,4096],f32>
    %6500 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %6501 = "torch.prims.convert_element_type"(%6499, %6500) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %6502 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %6503 = "torch.aten.add.Tensor"(%6420, %6501, %6502) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %6504 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %6505 = "torch.prims.convert_element_type"(%6503, %6504) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f32>
    %6506 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %6507 = "torch.aten.pow.Tensor_Scalar"(%6505, %6506) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],f32>
    %6508 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %6509 = "torch.prim.ListConstruct"(%6508) : (!torch.int) -> !torch.list<int>
    %6510 = "torch.constant.bool"() <{value = true}> : () -> !torch.bool
    %6511 = "torch.constant.none"() : () -> !torch.none
    %6512 = "torch.aten.mean.dim"(%6507, %6509, %6510, %6511) : (!torch.vtensor<[4,1,4096],f32>, !torch.list<int>, !torch.bool, !torch.none) -> !torch.vtensor<[4,1,1],f32>
    %6513 = "torch.constant.float"() <{value = 1.000000e-05 : f64}> : () -> !torch.float
    %6514 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %6515 = "torch.aten.add.Scalar"(%6512, %6513, %6514) : (!torch.vtensor<[4,1,1],f32>, !torch.float, !torch.int) -> !torch.vtensor<[4,1,1],f32>
    %6516 = "torch.aten.rsqrt"(%6515) : (!torch.vtensor<[4,1,1],f32>) -> !torch.vtensor<[4,1,1],f32>
    %6517 = "torch.aten.mul.Tensor"(%6505, %6516) : (!torch.vtensor<[4,1,4096],f32>, !torch.vtensor<[4,1,1],f32>) -> !torch.vtensor<[4,1,4096],f32>
    %6518 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %6519 = "torch.prims.convert_element_type"(%6517, %6518) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %6520 = "torch.aten.mul.Tensor"(%550, %6519) : (!torch.vtensor<[4096],bf16>, !torch.vtensor<[4,1,4096],bf16>) -> !torch.vtensor<[4,1,4096],bf16>
    %6521 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %6522 = "torch.prims.convert_element_type"(%6520, %6521) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %6523 = "torch.aten.div.Tensor"(%6522, %552) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %6524 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %6525 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %6526 = "torch.aten.clamp"(%6523, %6524, %6525) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %6527 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %6528 = "torch.prims.convert_element_type"(%6526, %6527) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %6529 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %6530 = "torch.aten.unsqueeze"(%554, %6529) : (!torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>
    %6531 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %6532 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %6533 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %6534 = "torch.prim.ListConstruct"(%6531, %6532, %6533) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6535 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %6536 = "torch.aten.expand"(%6530, %6534, %6535) : (!torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ>
    %6537 = "torch_c.to_builtin_tensor"(%6528) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %6538 = "torch_c.to_builtin_tensor"(%6536) : (!torch.vtensor<[4,4096,4096],f8E4M3FNUZ>) -> tensor<4x4096x4096xf8E4M3FNUZ>
    %6539 = "util.call"(%6537, %6538) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x1x4096xf32>
    %6540 = "torch_c.from_builtin_tensor"(%6539) : (tensor<4x1x4096xf32>) -> !torch.vtensor<[4,1,4096],f32>
    %6541 = "torch.aten.div.Tensor"(%6540, %556) : (!torch.vtensor<[4,1,4096],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],f32>
    %6542 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %6543 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %6544 = "torch.aten.clamp"(%6541, %6542, %6543) : (!torch.vtensor<[4,1,4096],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],f32>
    %6545 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %6546 = "torch.prims.convert_element_type"(%6544, %6545) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %6547 = "torch.aten.div.Tensor"(%6522, %558) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %6548 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %6549 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %6550 = "torch.aten.clamp"(%6547, %6548, %6549) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %6551 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %6552 = "torch.prims.convert_element_type"(%6550, %6551) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %6553 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %6554 = "torch.aten.unsqueeze"(%560, %6553) : (!torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>
    %6555 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %6556 = "torch.constant.int"() <{value = 1024 : i64}> : () -> !torch.int
    %6557 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %6558 = "torch.prim.ListConstruct"(%6555, %6556, %6557) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6559 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %6560 = "torch.aten.expand"(%6554, %6558, %6559) : (!torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ>
    %6561 = "torch_c.to_builtin_tensor"(%6552) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %6562 = "torch_c.to_builtin_tensor"(%6560) : (!torch.vtensor<[4,1024,4096],f8E4M3FNUZ>) -> tensor<4x1024x4096xf8E4M3FNUZ>
    %6563 = "util.call"(%6561, %6562) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x1x1024xf32>
    %6564 = "torch_c.from_builtin_tensor"(%6563) : (tensor<4x1x1024xf32>) -> !torch.vtensor<[4,1,1024],f32>
    %6565 = "torch.aten.div.Tensor"(%6564, %562) : (!torch.vtensor<[4,1,1024],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,1024],f32>
    %6566 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %6567 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %6568 = "torch.aten.clamp"(%6565, %6566, %6567) : (!torch.vtensor<[4,1,1024],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,1024],f32>
    %6569 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %6570 = "torch.prims.convert_element_type"(%6568, %6569) : (!torch.vtensor<[4,1,1024],f32>, !torch.int) -> !torch.vtensor<[4,1,1024],f8E4M3FNUZ>
    %6571 = "torch.aten.div.Tensor"(%6522, %564) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %6572 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %6573 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %6574 = "torch.aten.clamp"(%6571, %6572, %6573) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %6575 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %6576 = "torch.prims.convert_element_type"(%6574, %6575) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %6577 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %6578 = "torch.aten.unsqueeze"(%566, %6577) : (!torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>
    %6579 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %6580 = "torch.constant.int"() <{value = 1024 : i64}> : () -> !torch.int
    %6581 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %6582 = "torch.prim.ListConstruct"(%6579, %6580, %6581) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6583 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %6584 = "torch.aten.expand"(%6578, %6582, %6583) : (!torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ>
    %6585 = "torch_c.to_builtin_tensor"(%6576) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %6586 = "torch_c.to_builtin_tensor"(%6584) : (!torch.vtensor<[4,1024,4096],f8E4M3FNUZ>) -> tensor<4x1024x4096xf8E4M3FNUZ>
    %6587 = "util.call"(%6585, %6586) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x1x1024xf32>
    %6588 = "torch_c.from_builtin_tensor"(%6587) : (tensor<4x1x1024xf32>) -> !torch.vtensor<[4,1,1024],f32>
    %6589 = "torch.aten.div.Tensor"(%6588, %568) : (!torch.vtensor<[4,1,1024],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,1024],f32>
    %6590 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %6591 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %6592 = "torch.aten.clamp"(%6589, %6590, %6591) : (!torch.vtensor<[4,1,1024],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,1024],f32>
    %6593 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %6594 = "torch.prims.convert_element_type"(%6592, %6593) : (!torch.vtensor<[4,1,1024],f32>, !torch.int) -> !torch.vtensor<[4,1,1024],f8E4M3FNUZ>
    %6595 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %6596 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %6597 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %6598 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %6599 = "torch.prim.ListConstruct"(%6595, %6596, %6597, %6598) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6600 = "torch.aten.view"(%6546, %6599) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,1,32,128],f8E4M3FNUZ>
    %6601 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %6602 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %6603 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %6604 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %6605 = "torch.prim.ListConstruct"(%6601, %6602, %6603, %6604) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6606 = "torch.aten.view"(%6570, %6605) : (!torch.vtensor<[4,1,1024],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>
    %6607 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %6608 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %6609 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %6610 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %6611 = "torch.prim.ListConstruct"(%6607, %6608, %6609, %6610) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6612 = "torch.aten.view"(%6594, %6611) : (!torch.vtensor<[4,1,1024],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>
    %6613 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %6614 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %6615 = "torch.aten.transpose.int"(%6600, %6613, %6614) : (!torch.vtensor<[4,1,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %6616 = "torch.aten.mul.Tensor"(%6615, %1637) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,1,1,128],bf16>) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %6617 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %6618 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %6619 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %6620 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %6621 = "torch.aten.slice.Tensor"(%6615, %6617, %6618, %6619, %6620) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,32,1,64],f8E4M3FNUZ>
    %6622 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %6623 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %6624 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %6625 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %6626 = "torch.aten.slice.Tensor"(%6615, %6622, %6623, %6624, %6625) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,32,1,64],f8E4M3FNUZ>
    %6627 = "torch.aten.neg"(%6626) : (!torch.vtensor<[4,32,1,64],f8E4M3FNUZ>) -> !torch.vtensor<[4,32,1,64],f8E4M3FNUZ>
    %6628 = "torch.prim.ListConstruct"(%6627, %6621) : (!torch.vtensor<[4,32,1,64],f8E4M3FNUZ>, !torch.vtensor<[4,32,1,64],f8E4M3FNUZ>) -> !torch.list<vtensor>
    %6629 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %6630 = "torch.aten.cat"(%6628, %6629) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %6631 = "torch.aten.mul.Tensor"(%6630, %1651) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,1,1,128],bf16>) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %6632 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %6633 = "torch.aten.add.Tensor"(%6616, %6631, %6632) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %6634 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %6635 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %6636 = "torch.aten.transpose.int"(%6633, %6634, %6635) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,1,32,128],f8E4M3FNUZ>
    %6637 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %6638 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %6639 = "torch.aten.transpose.int"(%6606, %6637, %6638) : (!torch.vtensor<[4,1,8,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %6640 = "torch.aten.mul.Tensor"(%6639, %1637) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,1,1,128],bf16>) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %6641 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %6642 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %6643 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %6644 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %6645 = "torch.aten.slice.Tensor"(%6639, %6641, %6642, %6643, %6644) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,8,1,64],f8E4M3FNUZ>
    %6646 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %6647 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %6648 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %6649 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %6650 = "torch.aten.slice.Tensor"(%6639, %6646, %6647, %6648, %6649) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,8,1,64],f8E4M3FNUZ>
    %6651 = "torch.aten.neg"(%6650) : (!torch.vtensor<[4,8,1,64],f8E4M3FNUZ>) -> !torch.vtensor<[4,8,1,64],f8E4M3FNUZ>
    %6652 = "torch.prim.ListConstruct"(%6651, %6645) : (!torch.vtensor<[4,8,1,64],f8E4M3FNUZ>, !torch.vtensor<[4,8,1,64],f8E4M3FNUZ>) -> !torch.list<vtensor>
    %6653 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %6654 = "torch.aten.cat"(%6652, %6653) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %6655 = "torch.aten.mul.Tensor"(%6654, %1651) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,1,1,128],bf16>) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %6656 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %6657 = "torch.aten.add.Tensor"(%6640, %6655, %6656) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %6658 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %6659 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %6660 = "torch.aten.transpose.int"(%6657, %6658, %6659) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>
    %6661 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %6662 = "torch.aten.floor_divide.Scalar"(%arg64, %6661) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4],si64>
    %6663 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %6664 = "torch.aten.unsqueeze"(%6662, %6663) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %6665 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %6666 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %6667 = "torch.aten.gather"(%arg65, %6665, %6664, %6666) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.vtensor<[4,1],si64>, !torch.bool) -> !torch.vtensor<[4,1],si64>
    %6668 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %6669 = "torch.aten.remainder.Scalar"(%arg64, %6668) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4],si64>
    %6670 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %6671 = "torch.aten.unsqueeze"(%6669, %6670) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %6672 = "torch.constant.none"() : () -> !torch.none
    %6673 = "torch.aten.clone"(%569, %6672) : (!torch.vtensor<[],si64>, !torch.none) -> !torch.vtensor<[],si64>
    %6674 = "torch.aten.detach"(%6673) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %6675 = "torch.aten.detach"(%6674) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %6676 = "torch.aten.detach"(%6675) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %6677 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %6678 = "torch.aten.unsqueeze"(%6676, %6677) : (!torch.vtensor<[],si64>, !torch.int) -> !torch.vtensor<[1],si64>
    %6679 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %6680 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %6681 = "torch.prim.ListConstruct"(%6679, %6680) : (!torch.int, !torch.int) -> !torch.list<int>
    %6682 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %6683 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %6684 = "torch.prim.ListConstruct"(%6682, %6683) : (!torch.int, !torch.int) -> !torch.list<int>
    %6685 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %6686 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %6687 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %6688 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %6689 = "torch.aten.empty_strided"(%6681, %6684, %6685, %6686, %6687, %6688) : (!torch.list<int>, !torch.list<int>, !torch.int, !torch.int, !torch.Device, !torch.bool) -> !torch.vtensor<[4,1],si64>
    %6690 = "torch.constant.int"() <{value = 10 : i64}> : () -> !torch.int
    %6691 = "torch.aten.fill.Scalar"(%6689, %6690) : (!torch.vtensor<[4,1],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %6692 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %6693 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %6694 = "torch.prim.ListConstruct"(%6692, %6693) : (!torch.int, !torch.int) -> !torch.list<int>
    %6695 = "torch.aten.repeat"(%6678, %6694) : (!torch.vtensor<[1],si64>, !torch.list<int>) -> !torch.vtensor<[4,1],si64>
    %6696 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %6697 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %6698 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %6699 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %6700 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %6701 = "torch.prim.ListConstruct"(%1483, %6696, %6697, %6698, %6699, %6700) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6702 = "torch.aten.view"(%6272, %6701) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%6702, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %6703 = "torch.prim.ListConstruct"(%6667, %6691, %6695, %6671) : (!torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>) -> !torch.list<optional<vtensor>>
    %6704 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %6705 = "torch.aten.index_put"(%6702, %6703, %6660, %6704) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>, !torch.bool) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%6705, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %6706 = "torch.constant.int"() <{value = 2097152 : i64}> : () -> !torch.int
    %6707 = "torch.prim.ListConstruct"(%1483, %6706) : (!torch.int, !torch.int) -> !torch.list<int>
    %6708 = "torch.aten.view"(%6705, %6707) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,2097152],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%6708, %1479) <{shape_expressions = #map2}> : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> ()
    %6709 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %6710 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %6711 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %6712 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %6713 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %6714 = "torch.prim.ListConstruct"(%1483, %6709, %6710, %6711, %6712, %6713) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6715 = "torch.aten.view"(%6708, %6714) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%6715, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %6716 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %6717 = "torch.aten.floor_divide.Scalar"(%arg64, %6716) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4],si64>
    %6718 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %6719 = "torch.aten.unsqueeze"(%6717, %6718) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %6720 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %6721 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %6722 = "torch.aten.gather"(%arg65, %6720, %6719, %6721) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.vtensor<[4,1],si64>, !torch.bool) -> !torch.vtensor<[4,1],si64>
    %6723 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %6724 = "torch.aten.remainder.Scalar"(%arg64, %6723) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4],si64>
    %6725 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %6726 = "torch.aten.unsqueeze"(%6724, %6725) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %6727 = "torch.constant.none"() : () -> !torch.none
    %6728 = "torch.aten.clone"(%570, %6727) : (!torch.vtensor<[],si64>, !torch.none) -> !torch.vtensor<[],si64>
    %6729 = "torch.aten.detach"(%6728) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %6730 = "torch.aten.detach"(%6729) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %6731 = "torch.aten.detach"(%6730) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %6732 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %6733 = "torch.aten.unsqueeze"(%6731, %6732) : (!torch.vtensor<[],si64>, !torch.int) -> !torch.vtensor<[1],si64>
    %6734 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %6735 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %6736 = "torch.prim.ListConstruct"(%6734, %6735) : (!torch.int, !torch.int) -> !torch.list<int>
    %6737 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %6738 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %6739 = "torch.prim.ListConstruct"(%6737, %6738) : (!torch.int, !torch.int) -> !torch.list<int>
    %6740 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %6741 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %6742 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %6743 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %6744 = "torch.aten.empty_strided"(%6736, %6739, %6740, %6741, %6742, %6743) : (!torch.list<int>, !torch.list<int>, !torch.int, !torch.int, !torch.Device, !torch.bool) -> !torch.vtensor<[4,1],si64>
    %6745 = "torch.constant.int"() <{value = 10 : i64}> : () -> !torch.int
    %6746 = "torch.aten.fill.Scalar"(%6744, %6745) : (!torch.vtensor<[4,1],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %6747 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %6748 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %6749 = "torch.prim.ListConstruct"(%6747, %6748) : (!torch.int, !torch.int) -> !torch.list<int>
    %6750 = "torch.aten.repeat"(%6733, %6749) : (!torch.vtensor<[1],si64>, !torch.list<int>) -> !torch.vtensor<[4,1],si64>
    %6751 = "torch.prim.ListConstruct"(%6722, %6746, %6750, %6726) : (!torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>) -> !torch.list<optional<vtensor>>
    %6752 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %6753 = "torch.aten.index_put"(%6715, %6751, %6612, %6752) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>, !torch.bool) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%6753, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %6754 = "torch.constant.int"() <{value = 2097152 : i64}> : () -> !torch.int
    %6755 = "torch.prim.ListConstruct"(%1483, %6754) : (!torch.int, !torch.int) -> !torch.list<int>
    %6756 = "torch.aten.view"(%6753, %6755) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,2097152],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%6756, %1479) <{shape_expressions = #map2}> : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> ()
    %6757 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %6758 = "torch.aten.mul.Scalar"(%arg65, %6757) : (!torch.vtensor<[4,?],si64>, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%6758, %1478) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %6759 = "torch.constant.int"() <{value = 20 : i64}> : () -> !torch.int
    %6760 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %6761 = "torch.aten.add.Scalar"(%6758, %6759, %6760) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%6761, %1478) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %6762 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %6763 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %6764 = "torch.aten.add.Scalar"(%6761, %6762, %6763) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%6764, %1478) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %6765 = "torch.prim.ListConstruct"(%1924) : (!torch.int) -> !torch.list<int>
    %6766 = "torch.aten.view"(%6764, %6765) : (!torch.vtensor<[4,?],si64>, !torch.list<int>) -> !torch.vtensor<[?],si64>
    "torch.bind_symbolic_shape"(%6766, %1478) <{shape_expressions = #map24}> : (!torch.vtensor<[?],si64>, !torch.int) -> ()
    %6767 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %6768 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %6769 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %6770 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %6771 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %6772 = "torch.prim.ListConstruct"(%1483, %6767, %6768, %6769, %6770, %6771) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6773 = "torch.aten.view"(%6756, %6772) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%6773, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %6774 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %6775 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %6776 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %6777 = "torch.prim.ListConstruct"(%1914, %6774, %6775, %6776) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6778 = "torch.aten.view"(%6773, %6777) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%6778, %1479) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %6779 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %6780 = "torch.aten.index_select"(%6778, %6779, %6766) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int, !torch.vtensor<[?],si64>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%6780, %1478) <{shape_expressions = #map23}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %6781 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %6782 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %6783 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %6784 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %6785 = "torch.prim.ListConstruct"(%6781, %1481, %6782, %6783, %6784) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6786 = "torch.aten.view"(%6780, %6785) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%6786, %1478) <{shape_expressions = #map22}> : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %6787 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %6788 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %6789 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %6790 = "torch.prim.ListConstruct"(%6787, %1485, %6788, %6789) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6791 = "torch.aten.view"(%6786, %6790) : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%6791, %1478) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %6792 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %6793 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %6794 = "torch.aten.add.Scalar"(%6761, %6792, %6793) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%6794, %1478) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %6795 = "torch.prim.ListConstruct"(%1924) : (!torch.int) -> !torch.list<int>
    %6796 = "torch.aten.view"(%6794, %6795) : (!torch.vtensor<[4,?],si64>, !torch.list<int>) -> !torch.vtensor<[?],si64>
    "torch.bind_symbolic_shape"(%6796, %1478) <{shape_expressions = #map24}> : (!torch.vtensor<[?],si64>, !torch.int) -> ()
    %6797 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %6798 = "torch.aten.index_select"(%6778, %6797, %6796) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int, !torch.vtensor<[?],si64>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%6798, %1478) <{shape_expressions = #map23}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %6799 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %6800 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %6801 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %6802 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %6803 = "torch.prim.ListConstruct"(%6799, %1481, %6800, %6801, %6802) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6804 = "torch.aten.view"(%6798, %6803) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%6804, %1478) <{shape_expressions = #map22}> : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %6805 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %6806 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %6807 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %6808 = "torch.prim.ListConstruct"(%6805, %1485, %6806, %6807) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6809 = "torch.aten.view"(%6804, %6808) : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%6809, %1478) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %6810 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %6811 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %6812 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %6813 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %6814 = "torch.aten.slice.Tensor"(%6791, %6810, %6811, %6812, %6813) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%6814, %1478) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %6815 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %6816 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %6817 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %6818 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %6819 = "torch.aten.slice.Tensor"(%6809, %6815, %6816, %6817, %6818) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%6819, %1478) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %6820 = "torch.constant.int"() <{value = -2 : i64}> : () -> !torch.int
    %6821 = "torch.aten.unsqueeze"(%6814, %6820) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%6821, %1478) <{shape_expressions = #map25}> : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.int) -> ()
    %6822 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %6823 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %6824 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %6825 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %6826 = "torch.prim.ListConstruct"(%6822, %1485, %6823, %6824, %6825) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6827 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %6828 = "torch.aten.expand"(%6821, %6826, %6827) : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%6828, %1478) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %6829 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %6830 = "torch.aten.clone"(%6828, %6829) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%6830, %1478) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %6831 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %6832 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %6833 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %6834 = "torch.prim.ListConstruct"(%6831, %1485, %6832, %6833) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6835 = "torch.aten._unsafe_view"(%6830, %6834) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%6835, %1478) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %6836 = "torch.constant.int"() <{value = -2 : i64}> : () -> !torch.int
    %6837 = "torch.aten.unsqueeze"(%6819, %6836) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%6837, %1478) <{shape_expressions = #map25}> : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.int) -> ()
    %6838 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %6839 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %6840 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %6841 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %6842 = "torch.prim.ListConstruct"(%6838, %1485, %6839, %6840, %6841) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6843 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %6844 = "torch.aten.expand"(%6837, %6842, %6843) : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%6844, %1478) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %6845 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %6846 = "torch.aten.clone"(%6844, %6845) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%6846, %1478) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %6847 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %6848 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %6849 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %6850 = "torch.prim.ListConstruct"(%6847, %1485, %6848, %6849) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6851 = "torch.aten._unsafe_view"(%6846, %6850) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%6851, %1478) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %6852 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %6853 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %6854 = "torch.aten.transpose.int"(%6636, %6852, %6853) : (!torch.vtensor<[4,1,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %6855 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %6856 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %6857 = "torch.aten.transpose.int"(%6835, %6855, %6856) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%6857, %1478) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %6858 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %6859 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %6860 = "torch.aten.transpose.int"(%6851, %6858, %6859) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%6860, %1478) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %6861 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %6862 = "torch.aten.squeeze.dim"(%1516, %6861) : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,1,1,?],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%6862, %1478) <{shape_expressions = #map7}> : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>, !torch.int) -> ()
    %6863 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %6864 = "torch.aten.squeeze.dim"(%6862, %6863) : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,1,1,?],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%6864, %1478) <{shape_expressions = #map7}> : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>, !torch.int) -> ()
    %6865 = "torch_c.to_builtin_tensor"(%6854) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>) -> tensor<4x32x1x128xf8E4M3FNUZ>
    %6866 = "tensor.cast"(%6865) : (tensor<4x32x1x128xf8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %6867 = "torch_c.to_builtin_tensor"(%6857) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %6868 = "torch_c.to_builtin_tensor"(%6860) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %6869 = "torch_c.to_builtin_tensor"(%6864) : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>) -> tensor<4x1x1x?xf8E4M3FNUZ>
    %6870 = "tensor.cast"(%6869) : (tensor<4x1x1x?xf8E4M3FNUZ>) -> tensor<?x?x?x?xf8E4M3FNUZ>
    %6871 = "torch_c.to_builtin_tensor"(%572) : (!torch.vtensor<[],f32>) -> tensor<f32>
    %6872 = "util.call"(%6866, %6867, %6868, %6871, %6870) <{callee = @sharktank_masked_flash_attention_4_32_128_128_f8E4M3FNUZ_f32_f32}> : (tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<f32>, tensor<?x?x?x?xf8E4M3FNUZ>) -> tensor<4x32x?x128xf32>
    %6873 = "tensor.cast"(%6872) : (tensor<4x32x?x128xf32>) -> tensor<4x32x1x128xf32>
    %6874 = "torch_c.from_builtin_tensor"(%6873) : (tensor<4x32x1x128xf32>) -> !torch.vtensor<[4,32,1,128],f32>
    %6875 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %6876 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %6877 = "torch.aten.transpose.int"(%6874, %6875, %6876) : (!torch.vtensor<[4,32,1,128],f32>, !torch.int, !torch.int) -> !torch.vtensor<[4,1,32,128],f32>
    %6878 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %6879 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %6880 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %6881 = "torch.prim.ListConstruct"(%6878, %6879, %6880) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6882 = "torch.aten.view"(%6877, %6881) : (!torch.vtensor<[4,1,32,128],f32>, !torch.list<int>) -> !torch.vtensor<[4,1,4096],f32>
    %6883 = "torch.aten.div.Tensor"(%6882, %574) : (!torch.vtensor<[4,1,4096],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],f32>
    %6884 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %6885 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %6886 = "torch.aten.clamp"(%6883, %6884, %6885) : (!torch.vtensor<[4,1,4096],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],f32>
    %6887 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %6888 = "torch.prims.convert_element_type"(%6886, %6887) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %6889 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %6890 = "torch.aten.unsqueeze"(%576, %6889) : (!torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>
    %6891 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %6892 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %6893 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %6894 = "torch.prim.ListConstruct"(%6891, %6892, %6893) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6895 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %6896 = "torch.aten.expand"(%6890, %6894, %6895) : (!torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ>
    %6897 = "torch_c.to_builtin_tensor"(%6888) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %6898 = "torch_c.to_builtin_tensor"(%6896) : (!torch.vtensor<[4,4096,4096],f8E4M3FNUZ>) -> tensor<4x4096x4096xf8E4M3FNUZ>
    %6899 = "util.call"(%6897, %6898) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x1x4096xf32>
    %6900 = "torch_c.from_builtin_tensor"(%6899) : (tensor<4x1x4096xf32>) -> !torch.vtensor<[4,1,4096],f32>
    %6901 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %6902 = "torch.prims.convert_element_type"(%6900, %6901) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %6903 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %6904 = "torch.aten.add.Tensor"(%6503, %6902, %6903) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %6905 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %6906 = "torch.prims.convert_element_type"(%6904, %6905) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f32>
    %6907 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %6908 = "torch.aten.pow.Tensor_Scalar"(%6906, %6907) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],f32>
    %6909 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %6910 = "torch.prim.ListConstruct"(%6909) : (!torch.int) -> !torch.list<int>
    %6911 = "torch.constant.bool"() <{value = true}> : () -> !torch.bool
    %6912 = "torch.constant.none"() : () -> !torch.none
    %6913 = "torch.aten.mean.dim"(%6908, %6910, %6911, %6912) : (!torch.vtensor<[4,1,4096],f32>, !torch.list<int>, !torch.bool, !torch.none) -> !torch.vtensor<[4,1,1],f32>
    %6914 = "torch.constant.float"() <{value = 1.000000e-05 : f64}> : () -> !torch.float
    %6915 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %6916 = "torch.aten.add.Scalar"(%6913, %6914, %6915) : (!torch.vtensor<[4,1,1],f32>, !torch.float, !torch.int) -> !torch.vtensor<[4,1,1],f32>
    %6917 = "torch.aten.rsqrt"(%6916) : (!torch.vtensor<[4,1,1],f32>) -> !torch.vtensor<[4,1,1],f32>
    %6918 = "torch.aten.mul.Tensor"(%6906, %6917) : (!torch.vtensor<[4,1,4096],f32>, !torch.vtensor<[4,1,1],f32>) -> !torch.vtensor<[4,1,4096],f32>
    %6919 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %6920 = "torch.prims.convert_element_type"(%6918, %6919) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %6921 = "torch.aten.mul.Tensor"(%578, %6920) : (!torch.vtensor<[4096],bf16>, !torch.vtensor<[4,1,4096],bf16>) -> !torch.vtensor<[4,1,4096],bf16>
    %6922 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %6923 = "torch.prims.convert_element_type"(%6921, %6922) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %6924 = "torch.aten.div.Tensor"(%6923, %580) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %6925 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %6926 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %6927 = "torch.aten.clamp"(%6924, %6925, %6926) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %6928 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %6929 = "torch.prims.convert_element_type"(%6927, %6928) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %6930 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %6931 = "torch.aten.unsqueeze"(%582, %6930) : (!torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>
    %6932 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %6933 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %6934 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %6935 = "torch.prim.ListConstruct"(%6932, %6933, %6934) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6936 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %6937 = "torch.aten.expand"(%6931, %6935, %6936) : (!torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ>
    %6938 = "torch_c.to_builtin_tensor"(%6929) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %6939 = "torch_c.to_builtin_tensor"(%6937) : (!torch.vtensor<[4,14336,4096],f8E4M3FNUZ>) -> tensor<4x14336x4096xf8E4M3FNUZ>
    %6940 = "util.call"(%6938, %6939) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x1x14336xf32>
    %6941 = "torch_c.from_builtin_tensor"(%6940) : (tensor<4x1x14336xf32>) -> !torch.vtensor<[4,1,14336],f32>
    %6942 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %6943 = "torch.prims.convert_element_type"(%6941, %6942) : (!torch.vtensor<[4,1,14336],f32>, !torch.int) -> !torch.vtensor<[4,1,14336],bf16>
    %6944 = "torch.aten.silu"(%6943) : (!torch.vtensor<[4,1,14336],bf16>) -> !torch.vtensor<[4,1,14336],bf16>
    %6945 = "torch.aten.div.Tensor"(%6923, %584) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %6946 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %6947 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %6948 = "torch.aten.clamp"(%6945, %6946, %6947) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %6949 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %6950 = "torch.prims.convert_element_type"(%6948, %6949) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %6951 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %6952 = "torch.aten.unsqueeze"(%586, %6951) : (!torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>
    %6953 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %6954 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %6955 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %6956 = "torch.prim.ListConstruct"(%6953, %6954, %6955) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6957 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %6958 = "torch.aten.expand"(%6952, %6956, %6957) : (!torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ>
    %6959 = "torch_c.to_builtin_tensor"(%6950) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %6960 = "torch_c.to_builtin_tensor"(%6958) : (!torch.vtensor<[4,14336,4096],f8E4M3FNUZ>) -> tensor<4x14336x4096xf8E4M3FNUZ>
    %6961 = "util.call"(%6959, %6960) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x1x14336xf32>
    %6962 = "torch_c.from_builtin_tensor"(%6961) : (tensor<4x1x14336xf32>) -> !torch.vtensor<[4,1,14336],f32>
    %6963 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %6964 = "torch.prims.convert_element_type"(%6962, %6963) : (!torch.vtensor<[4,1,14336],f32>, !torch.int) -> !torch.vtensor<[4,1,14336],bf16>
    %6965 = "torch.aten.mul.Tensor"(%6944, %6964) : (!torch.vtensor<[4,1,14336],bf16>, !torch.vtensor<[4,1,14336],bf16>) -> !torch.vtensor<[4,1,14336],bf16>
    %6966 = "torch.aten.div.Tensor"(%6965, %588) : (!torch.vtensor<[4,1,14336],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,14336],bf16>
    %6967 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %6968 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %6969 = "torch.aten.clamp"(%6966, %6967, %6968) : (!torch.vtensor<[4,1,14336],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,14336],bf16>
    %6970 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %6971 = "torch.prims.convert_element_type"(%6969, %6970) : (!torch.vtensor<[4,1,14336],bf16>, !torch.int) -> !torch.vtensor<[4,1,14336],f8E4M3FNUZ>
    %6972 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %6973 = "torch.aten.unsqueeze"(%590, %6972) : (!torch.vtensor<[4096,14336],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,14336],f8E4M3FNUZ>
    %6974 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %6975 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %6976 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %6977 = "torch.prim.ListConstruct"(%6974, %6975, %6976) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6978 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %6979 = "torch.aten.expand"(%6973, %6977, %6978) : (!torch.vtensor<[1,4096,14336],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,14336],f8E4M3FNUZ>
    %6980 = "torch_c.to_builtin_tensor"(%6971) : (!torch.vtensor<[4,1,14336],f8E4M3FNUZ>) -> tensor<4x1x14336xf8E4M3FNUZ>
    %6981 = "torch_c.to_builtin_tensor"(%6979) : (!torch.vtensor<[4,4096,14336],f8E4M3FNUZ>) -> tensor<4x4096x14336xf8E4M3FNUZ>
    %6982 = "util.call"(%6980, %6981) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x14336xf8E4M3FNUZ_R4x4096x14336xf8E4M3FNUZ}> : (tensor<4x1x14336xf8E4M3FNUZ>, tensor<4x4096x14336xf8E4M3FNUZ>) -> tensor<4x1x4096xf32>
    %6983 = "torch_c.from_builtin_tensor"(%6982) : (tensor<4x1x4096xf32>) -> !torch.vtensor<[4,1,4096],f32>
    %6984 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %6985 = "torch.prims.convert_element_type"(%6983, %6984) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %6986 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %6987 = "torch.aten.add.Tensor"(%6904, %6985, %6986) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %6988 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %6989 = "torch.prims.convert_element_type"(%6987, %6988) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f32>
    %6990 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %6991 = "torch.aten.pow.Tensor_Scalar"(%6989, %6990) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],f32>
    %6992 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %6993 = "torch.prim.ListConstruct"(%6992) : (!torch.int) -> !torch.list<int>
    %6994 = "torch.constant.bool"() <{value = true}> : () -> !torch.bool
    %6995 = "torch.constant.none"() : () -> !torch.none
    %6996 = "torch.aten.mean.dim"(%6991, %6993, %6994, %6995) : (!torch.vtensor<[4,1,4096],f32>, !torch.list<int>, !torch.bool, !torch.none) -> !torch.vtensor<[4,1,1],f32>
    %6997 = "torch.constant.float"() <{value = 1.000000e-05 : f64}> : () -> !torch.float
    %6998 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %6999 = "torch.aten.add.Scalar"(%6996, %6997, %6998) : (!torch.vtensor<[4,1,1],f32>, !torch.float, !torch.int) -> !torch.vtensor<[4,1,1],f32>
    %7000 = "torch.aten.rsqrt"(%6999) : (!torch.vtensor<[4,1,1],f32>) -> !torch.vtensor<[4,1,1],f32>
    %7001 = "torch.aten.mul.Tensor"(%6989, %7000) : (!torch.vtensor<[4,1,4096],f32>, !torch.vtensor<[4,1,1],f32>) -> !torch.vtensor<[4,1,4096],f32>
    %7002 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %7003 = "torch.prims.convert_element_type"(%7001, %7002) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %7004 = "torch.aten.mul.Tensor"(%592, %7003) : (!torch.vtensor<[4096],bf16>, !torch.vtensor<[4,1,4096],bf16>) -> !torch.vtensor<[4,1,4096],bf16>
    %7005 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %7006 = "torch.prims.convert_element_type"(%7004, %7005) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %7007 = "torch.aten.div.Tensor"(%7006, %594) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %7008 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %7009 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %7010 = "torch.aten.clamp"(%7007, %7008, %7009) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %7011 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %7012 = "torch.prims.convert_element_type"(%7010, %7011) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %7013 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %7014 = "torch.aten.unsqueeze"(%596, %7013) : (!torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>
    %7015 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %7016 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %7017 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %7018 = "torch.prim.ListConstruct"(%7015, %7016, %7017) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7019 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %7020 = "torch.aten.expand"(%7014, %7018, %7019) : (!torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ>
    %7021 = "torch_c.to_builtin_tensor"(%7012) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %7022 = "torch_c.to_builtin_tensor"(%7020) : (!torch.vtensor<[4,4096,4096],f8E4M3FNUZ>) -> tensor<4x4096x4096xf8E4M3FNUZ>
    %7023 = "util.call"(%7021, %7022) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x1x4096xf32>
    %7024 = "torch_c.from_builtin_tensor"(%7023) : (tensor<4x1x4096xf32>) -> !torch.vtensor<[4,1,4096],f32>
    %7025 = "torch.aten.div.Tensor"(%7024, %598) : (!torch.vtensor<[4,1,4096],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],f32>
    %7026 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %7027 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %7028 = "torch.aten.clamp"(%7025, %7026, %7027) : (!torch.vtensor<[4,1,4096],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],f32>
    %7029 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %7030 = "torch.prims.convert_element_type"(%7028, %7029) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %7031 = "torch.aten.div.Tensor"(%7006, %600) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %7032 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %7033 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %7034 = "torch.aten.clamp"(%7031, %7032, %7033) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %7035 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %7036 = "torch.prims.convert_element_type"(%7034, %7035) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %7037 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %7038 = "torch.aten.unsqueeze"(%602, %7037) : (!torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>
    %7039 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %7040 = "torch.constant.int"() <{value = 1024 : i64}> : () -> !torch.int
    %7041 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %7042 = "torch.prim.ListConstruct"(%7039, %7040, %7041) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7043 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %7044 = "torch.aten.expand"(%7038, %7042, %7043) : (!torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ>
    %7045 = "torch_c.to_builtin_tensor"(%7036) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %7046 = "torch_c.to_builtin_tensor"(%7044) : (!torch.vtensor<[4,1024,4096],f8E4M3FNUZ>) -> tensor<4x1024x4096xf8E4M3FNUZ>
    %7047 = "util.call"(%7045, %7046) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x1x1024xf32>
    %7048 = "torch_c.from_builtin_tensor"(%7047) : (tensor<4x1x1024xf32>) -> !torch.vtensor<[4,1,1024],f32>
    %7049 = "torch.aten.div.Tensor"(%7048, %604) : (!torch.vtensor<[4,1,1024],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,1024],f32>
    %7050 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %7051 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %7052 = "torch.aten.clamp"(%7049, %7050, %7051) : (!torch.vtensor<[4,1,1024],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,1024],f32>
    %7053 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %7054 = "torch.prims.convert_element_type"(%7052, %7053) : (!torch.vtensor<[4,1,1024],f32>, !torch.int) -> !torch.vtensor<[4,1,1024],f8E4M3FNUZ>
    %7055 = "torch.aten.div.Tensor"(%7006, %606) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %7056 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %7057 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %7058 = "torch.aten.clamp"(%7055, %7056, %7057) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %7059 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %7060 = "torch.prims.convert_element_type"(%7058, %7059) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %7061 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %7062 = "torch.aten.unsqueeze"(%608, %7061) : (!torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>
    %7063 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %7064 = "torch.constant.int"() <{value = 1024 : i64}> : () -> !torch.int
    %7065 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %7066 = "torch.prim.ListConstruct"(%7063, %7064, %7065) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7067 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %7068 = "torch.aten.expand"(%7062, %7066, %7067) : (!torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ>
    %7069 = "torch_c.to_builtin_tensor"(%7060) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %7070 = "torch_c.to_builtin_tensor"(%7068) : (!torch.vtensor<[4,1024,4096],f8E4M3FNUZ>) -> tensor<4x1024x4096xf8E4M3FNUZ>
    %7071 = "util.call"(%7069, %7070) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x1x1024xf32>
    %7072 = "torch_c.from_builtin_tensor"(%7071) : (tensor<4x1x1024xf32>) -> !torch.vtensor<[4,1,1024],f32>
    %7073 = "torch.aten.div.Tensor"(%7072, %610) : (!torch.vtensor<[4,1,1024],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,1024],f32>
    %7074 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %7075 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %7076 = "torch.aten.clamp"(%7073, %7074, %7075) : (!torch.vtensor<[4,1,1024],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,1024],f32>
    %7077 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %7078 = "torch.prims.convert_element_type"(%7076, %7077) : (!torch.vtensor<[4,1,1024],f32>, !torch.int) -> !torch.vtensor<[4,1,1024],f8E4M3FNUZ>
    %7079 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %7080 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %7081 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %7082 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %7083 = "torch.prim.ListConstruct"(%7079, %7080, %7081, %7082) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7084 = "torch.aten.view"(%7030, %7083) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,1,32,128],f8E4M3FNUZ>
    %7085 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %7086 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %7087 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %7088 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %7089 = "torch.prim.ListConstruct"(%7085, %7086, %7087, %7088) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7090 = "torch.aten.view"(%7054, %7089) : (!torch.vtensor<[4,1,1024],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>
    %7091 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %7092 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %7093 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %7094 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %7095 = "torch.prim.ListConstruct"(%7091, %7092, %7093, %7094) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7096 = "torch.aten.view"(%7078, %7095) : (!torch.vtensor<[4,1,1024],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>
    %7097 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %7098 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %7099 = "torch.aten.transpose.int"(%7084, %7097, %7098) : (!torch.vtensor<[4,1,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %7100 = "torch.aten.mul.Tensor"(%7099, %1637) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,1,1,128],bf16>) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %7101 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %7102 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %7103 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %7104 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %7105 = "torch.aten.slice.Tensor"(%7099, %7101, %7102, %7103, %7104) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,32,1,64],f8E4M3FNUZ>
    %7106 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %7107 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %7108 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %7109 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %7110 = "torch.aten.slice.Tensor"(%7099, %7106, %7107, %7108, %7109) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,32,1,64],f8E4M3FNUZ>
    %7111 = "torch.aten.neg"(%7110) : (!torch.vtensor<[4,32,1,64],f8E4M3FNUZ>) -> !torch.vtensor<[4,32,1,64],f8E4M3FNUZ>
    %7112 = "torch.prim.ListConstruct"(%7111, %7105) : (!torch.vtensor<[4,32,1,64],f8E4M3FNUZ>, !torch.vtensor<[4,32,1,64],f8E4M3FNUZ>) -> !torch.list<vtensor>
    %7113 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %7114 = "torch.aten.cat"(%7112, %7113) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %7115 = "torch.aten.mul.Tensor"(%7114, %1651) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,1,1,128],bf16>) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %7116 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %7117 = "torch.aten.add.Tensor"(%7100, %7115, %7116) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %7118 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %7119 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %7120 = "torch.aten.transpose.int"(%7117, %7118, %7119) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,1,32,128],f8E4M3FNUZ>
    %7121 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %7122 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %7123 = "torch.aten.transpose.int"(%7090, %7121, %7122) : (!torch.vtensor<[4,1,8,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %7124 = "torch.aten.mul.Tensor"(%7123, %1637) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,1,1,128],bf16>) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %7125 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %7126 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %7127 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %7128 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %7129 = "torch.aten.slice.Tensor"(%7123, %7125, %7126, %7127, %7128) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,8,1,64],f8E4M3FNUZ>
    %7130 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %7131 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %7132 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %7133 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %7134 = "torch.aten.slice.Tensor"(%7123, %7130, %7131, %7132, %7133) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,8,1,64],f8E4M3FNUZ>
    %7135 = "torch.aten.neg"(%7134) : (!torch.vtensor<[4,8,1,64],f8E4M3FNUZ>) -> !torch.vtensor<[4,8,1,64],f8E4M3FNUZ>
    %7136 = "torch.prim.ListConstruct"(%7135, %7129) : (!torch.vtensor<[4,8,1,64],f8E4M3FNUZ>, !torch.vtensor<[4,8,1,64],f8E4M3FNUZ>) -> !torch.list<vtensor>
    %7137 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %7138 = "torch.aten.cat"(%7136, %7137) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %7139 = "torch.aten.mul.Tensor"(%7138, %1651) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,1,1,128],bf16>) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %7140 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %7141 = "torch.aten.add.Tensor"(%7124, %7139, %7140) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %7142 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %7143 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %7144 = "torch.aten.transpose.int"(%7141, %7142, %7143) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>
    %7145 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %7146 = "torch.aten.floor_divide.Scalar"(%arg64, %7145) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4],si64>
    %7147 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %7148 = "torch.aten.unsqueeze"(%7146, %7147) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %7149 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %7150 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %7151 = "torch.aten.gather"(%arg65, %7149, %7148, %7150) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.vtensor<[4,1],si64>, !torch.bool) -> !torch.vtensor<[4,1],si64>
    %7152 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %7153 = "torch.aten.remainder.Scalar"(%arg64, %7152) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4],si64>
    %7154 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %7155 = "torch.aten.unsqueeze"(%7153, %7154) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %7156 = "torch.constant.none"() : () -> !torch.none
    %7157 = "torch.aten.clone"(%611, %7156) : (!torch.vtensor<[],si64>, !torch.none) -> !torch.vtensor<[],si64>
    %7158 = "torch.aten.detach"(%7157) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %7159 = "torch.aten.detach"(%7158) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %7160 = "torch.aten.detach"(%7159) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %7161 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %7162 = "torch.aten.unsqueeze"(%7160, %7161) : (!torch.vtensor<[],si64>, !torch.int) -> !torch.vtensor<[1],si64>
    %7163 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %7164 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %7165 = "torch.prim.ListConstruct"(%7163, %7164) : (!torch.int, !torch.int) -> !torch.list<int>
    %7166 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %7167 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %7168 = "torch.prim.ListConstruct"(%7166, %7167) : (!torch.int, !torch.int) -> !torch.list<int>
    %7169 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %7170 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %7171 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %7172 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %7173 = "torch.aten.empty_strided"(%7165, %7168, %7169, %7170, %7171, %7172) : (!torch.list<int>, !torch.list<int>, !torch.int, !torch.int, !torch.Device, !torch.bool) -> !torch.vtensor<[4,1],si64>
    %7174 = "torch.constant.int"() <{value = 11 : i64}> : () -> !torch.int
    %7175 = "torch.aten.fill.Scalar"(%7173, %7174) : (!torch.vtensor<[4,1],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %7176 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %7177 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %7178 = "torch.prim.ListConstruct"(%7176, %7177) : (!torch.int, !torch.int) -> !torch.list<int>
    %7179 = "torch.aten.repeat"(%7162, %7178) : (!torch.vtensor<[1],si64>, !torch.list<int>) -> !torch.vtensor<[4,1],si64>
    %7180 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %7181 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %7182 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %7183 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %7184 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %7185 = "torch.prim.ListConstruct"(%1483, %7180, %7181, %7182, %7183, %7184) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7186 = "torch.aten.view"(%6756, %7185) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%7186, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %7187 = "torch.prim.ListConstruct"(%7151, %7175, %7179, %7155) : (!torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>) -> !torch.list<optional<vtensor>>
    %7188 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %7189 = "torch.aten.index_put"(%7186, %7187, %7144, %7188) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>, !torch.bool) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%7189, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %7190 = "torch.constant.int"() <{value = 2097152 : i64}> : () -> !torch.int
    %7191 = "torch.prim.ListConstruct"(%1483, %7190) : (!torch.int, !torch.int) -> !torch.list<int>
    %7192 = "torch.aten.view"(%7189, %7191) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,2097152],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%7192, %1479) <{shape_expressions = #map2}> : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> ()
    %7193 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %7194 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %7195 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %7196 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %7197 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %7198 = "torch.prim.ListConstruct"(%1483, %7193, %7194, %7195, %7196, %7197) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7199 = "torch.aten.view"(%7192, %7198) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%7199, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %7200 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %7201 = "torch.aten.floor_divide.Scalar"(%arg64, %7200) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4],si64>
    %7202 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %7203 = "torch.aten.unsqueeze"(%7201, %7202) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %7204 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %7205 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %7206 = "torch.aten.gather"(%arg65, %7204, %7203, %7205) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.vtensor<[4,1],si64>, !torch.bool) -> !torch.vtensor<[4,1],si64>
    %7207 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %7208 = "torch.aten.remainder.Scalar"(%arg64, %7207) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4],si64>
    %7209 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %7210 = "torch.aten.unsqueeze"(%7208, %7209) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %7211 = "torch.constant.none"() : () -> !torch.none
    %7212 = "torch.aten.clone"(%612, %7211) : (!torch.vtensor<[],si64>, !torch.none) -> !torch.vtensor<[],si64>
    %7213 = "torch.aten.detach"(%7212) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %7214 = "torch.aten.detach"(%7213) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %7215 = "torch.aten.detach"(%7214) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %7216 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %7217 = "torch.aten.unsqueeze"(%7215, %7216) : (!torch.vtensor<[],si64>, !torch.int) -> !torch.vtensor<[1],si64>
    %7218 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %7219 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %7220 = "torch.prim.ListConstruct"(%7218, %7219) : (!torch.int, !torch.int) -> !torch.list<int>
    %7221 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %7222 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %7223 = "torch.prim.ListConstruct"(%7221, %7222) : (!torch.int, !torch.int) -> !torch.list<int>
    %7224 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %7225 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %7226 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %7227 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %7228 = "torch.aten.empty_strided"(%7220, %7223, %7224, %7225, %7226, %7227) : (!torch.list<int>, !torch.list<int>, !torch.int, !torch.int, !torch.Device, !torch.bool) -> !torch.vtensor<[4,1],si64>
    %7229 = "torch.constant.int"() <{value = 11 : i64}> : () -> !torch.int
    %7230 = "torch.aten.fill.Scalar"(%7228, %7229) : (!torch.vtensor<[4,1],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %7231 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %7232 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %7233 = "torch.prim.ListConstruct"(%7231, %7232) : (!torch.int, !torch.int) -> !torch.list<int>
    %7234 = "torch.aten.repeat"(%7217, %7233) : (!torch.vtensor<[1],si64>, !torch.list<int>) -> !torch.vtensor<[4,1],si64>
    %7235 = "torch.prim.ListConstruct"(%7206, %7230, %7234, %7210) : (!torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>) -> !torch.list<optional<vtensor>>
    %7236 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %7237 = "torch.aten.index_put"(%7199, %7235, %7096, %7236) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>, !torch.bool) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%7237, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %7238 = "torch.constant.int"() <{value = 2097152 : i64}> : () -> !torch.int
    %7239 = "torch.prim.ListConstruct"(%1483, %7238) : (!torch.int, !torch.int) -> !torch.list<int>
    %7240 = "torch.aten.view"(%7237, %7239) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,2097152],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%7240, %1479) <{shape_expressions = #map2}> : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> ()
    %7241 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %7242 = "torch.aten.mul.Scalar"(%arg65, %7241) : (!torch.vtensor<[4,?],si64>, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%7242, %1478) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %7243 = "torch.constant.int"() <{value = 22 : i64}> : () -> !torch.int
    %7244 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %7245 = "torch.aten.add.Scalar"(%7242, %7243, %7244) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%7245, %1478) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %7246 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %7247 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %7248 = "torch.aten.add.Scalar"(%7245, %7246, %7247) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%7248, %1478) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %7249 = "torch.prim.ListConstruct"(%1924) : (!torch.int) -> !torch.list<int>
    %7250 = "torch.aten.view"(%7248, %7249) : (!torch.vtensor<[4,?],si64>, !torch.list<int>) -> !torch.vtensor<[?],si64>
    "torch.bind_symbolic_shape"(%7250, %1478) <{shape_expressions = #map24}> : (!torch.vtensor<[?],si64>, !torch.int) -> ()
    %7251 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %7252 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %7253 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %7254 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %7255 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %7256 = "torch.prim.ListConstruct"(%1483, %7251, %7252, %7253, %7254, %7255) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7257 = "torch.aten.view"(%7240, %7256) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%7257, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %7258 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %7259 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %7260 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %7261 = "torch.prim.ListConstruct"(%1914, %7258, %7259, %7260) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7262 = "torch.aten.view"(%7257, %7261) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%7262, %1479) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %7263 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %7264 = "torch.aten.index_select"(%7262, %7263, %7250) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int, !torch.vtensor<[?],si64>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%7264, %1478) <{shape_expressions = #map23}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %7265 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %7266 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %7267 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %7268 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %7269 = "torch.prim.ListConstruct"(%7265, %1481, %7266, %7267, %7268) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7270 = "torch.aten.view"(%7264, %7269) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%7270, %1478) <{shape_expressions = #map22}> : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %7271 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %7272 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %7273 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %7274 = "torch.prim.ListConstruct"(%7271, %1485, %7272, %7273) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7275 = "torch.aten.view"(%7270, %7274) : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%7275, %1478) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %7276 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %7277 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %7278 = "torch.aten.add.Scalar"(%7245, %7276, %7277) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%7278, %1478) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %7279 = "torch.prim.ListConstruct"(%1924) : (!torch.int) -> !torch.list<int>
    %7280 = "torch.aten.view"(%7278, %7279) : (!torch.vtensor<[4,?],si64>, !torch.list<int>) -> !torch.vtensor<[?],si64>
    "torch.bind_symbolic_shape"(%7280, %1478) <{shape_expressions = #map24}> : (!torch.vtensor<[?],si64>, !torch.int) -> ()
    %7281 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %7282 = "torch.aten.index_select"(%7262, %7281, %7280) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int, !torch.vtensor<[?],si64>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%7282, %1478) <{shape_expressions = #map23}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %7283 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %7284 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %7285 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %7286 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %7287 = "torch.prim.ListConstruct"(%7283, %1481, %7284, %7285, %7286) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7288 = "torch.aten.view"(%7282, %7287) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%7288, %1478) <{shape_expressions = #map22}> : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %7289 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %7290 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %7291 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %7292 = "torch.prim.ListConstruct"(%7289, %1485, %7290, %7291) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7293 = "torch.aten.view"(%7288, %7292) : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%7293, %1478) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %7294 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %7295 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %7296 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %7297 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %7298 = "torch.aten.slice.Tensor"(%7275, %7294, %7295, %7296, %7297) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%7298, %1478) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %7299 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %7300 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %7301 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %7302 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %7303 = "torch.aten.slice.Tensor"(%7293, %7299, %7300, %7301, %7302) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%7303, %1478) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %7304 = "torch.constant.int"() <{value = -2 : i64}> : () -> !torch.int
    %7305 = "torch.aten.unsqueeze"(%7298, %7304) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%7305, %1478) <{shape_expressions = #map25}> : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.int) -> ()
    %7306 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %7307 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %7308 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %7309 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %7310 = "torch.prim.ListConstruct"(%7306, %1485, %7307, %7308, %7309) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7311 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %7312 = "torch.aten.expand"(%7305, %7310, %7311) : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%7312, %1478) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %7313 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %7314 = "torch.aten.clone"(%7312, %7313) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%7314, %1478) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %7315 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %7316 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %7317 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %7318 = "torch.prim.ListConstruct"(%7315, %1485, %7316, %7317) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7319 = "torch.aten._unsafe_view"(%7314, %7318) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%7319, %1478) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %7320 = "torch.constant.int"() <{value = -2 : i64}> : () -> !torch.int
    %7321 = "torch.aten.unsqueeze"(%7303, %7320) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%7321, %1478) <{shape_expressions = #map25}> : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.int) -> ()
    %7322 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %7323 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %7324 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %7325 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %7326 = "torch.prim.ListConstruct"(%7322, %1485, %7323, %7324, %7325) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7327 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %7328 = "torch.aten.expand"(%7321, %7326, %7327) : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%7328, %1478) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %7329 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %7330 = "torch.aten.clone"(%7328, %7329) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%7330, %1478) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %7331 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %7332 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %7333 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %7334 = "torch.prim.ListConstruct"(%7331, %1485, %7332, %7333) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7335 = "torch.aten._unsafe_view"(%7330, %7334) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%7335, %1478) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %7336 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %7337 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %7338 = "torch.aten.transpose.int"(%7120, %7336, %7337) : (!torch.vtensor<[4,1,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %7339 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %7340 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %7341 = "torch.aten.transpose.int"(%7319, %7339, %7340) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%7341, %1478) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %7342 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %7343 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %7344 = "torch.aten.transpose.int"(%7335, %7342, %7343) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%7344, %1478) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %7345 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %7346 = "torch.aten.squeeze.dim"(%1516, %7345) : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,1,1,?],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%7346, %1478) <{shape_expressions = #map7}> : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>, !torch.int) -> ()
    %7347 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %7348 = "torch.aten.squeeze.dim"(%7346, %7347) : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,1,1,?],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%7348, %1478) <{shape_expressions = #map7}> : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>, !torch.int) -> ()
    %7349 = "torch_c.to_builtin_tensor"(%7338) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>) -> tensor<4x32x1x128xf8E4M3FNUZ>
    %7350 = "tensor.cast"(%7349) : (tensor<4x32x1x128xf8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %7351 = "torch_c.to_builtin_tensor"(%7341) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %7352 = "torch_c.to_builtin_tensor"(%7344) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %7353 = "torch_c.to_builtin_tensor"(%7348) : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>) -> tensor<4x1x1x?xf8E4M3FNUZ>
    %7354 = "tensor.cast"(%7353) : (tensor<4x1x1x?xf8E4M3FNUZ>) -> tensor<?x?x?x?xf8E4M3FNUZ>
    %7355 = "torch_c.to_builtin_tensor"(%614) : (!torch.vtensor<[],f32>) -> tensor<f32>
    %7356 = "util.call"(%7350, %7351, %7352, %7355, %7354) <{callee = @sharktank_masked_flash_attention_4_32_128_128_f8E4M3FNUZ_f32_f32}> : (tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<f32>, tensor<?x?x?x?xf8E4M3FNUZ>) -> tensor<4x32x?x128xf32>
    %7357 = "tensor.cast"(%7356) : (tensor<4x32x?x128xf32>) -> tensor<4x32x1x128xf32>
    %7358 = "torch_c.from_builtin_tensor"(%7357) : (tensor<4x32x1x128xf32>) -> !torch.vtensor<[4,32,1,128],f32>
    %7359 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %7360 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %7361 = "torch.aten.transpose.int"(%7358, %7359, %7360) : (!torch.vtensor<[4,32,1,128],f32>, !torch.int, !torch.int) -> !torch.vtensor<[4,1,32,128],f32>
    %7362 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %7363 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %7364 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %7365 = "torch.prim.ListConstruct"(%7362, %7363, %7364) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7366 = "torch.aten.view"(%7361, %7365) : (!torch.vtensor<[4,1,32,128],f32>, !torch.list<int>) -> !torch.vtensor<[4,1,4096],f32>
    %7367 = "torch.aten.div.Tensor"(%7366, %616) : (!torch.vtensor<[4,1,4096],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],f32>
    %7368 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %7369 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %7370 = "torch.aten.clamp"(%7367, %7368, %7369) : (!torch.vtensor<[4,1,4096],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],f32>
    %7371 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %7372 = "torch.prims.convert_element_type"(%7370, %7371) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %7373 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %7374 = "torch.aten.unsqueeze"(%618, %7373) : (!torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>
    %7375 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %7376 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %7377 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %7378 = "torch.prim.ListConstruct"(%7375, %7376, %7377) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7379 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %7380 = "torch.aten.expand"(%7374, %7378, %7379) : (!torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ>
    %7381 = "torch_c.to_builtin_tensor"(%7372) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %7382 = "torch_c.to_builtin_tensor"(%7380) : (!torch.vtensor<[4,4096,4096],f8E4M3FNUZ>) -> tensor<4x4096x4096xf8E4M3FNUZ>
    %7383 = "util.call"(%7381, %7382) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x1x4096xf32>
    %7384 = "torch_c.from_builtin_tensor"(%7383) : (tensor<4x1x4096xf32>) -> !torch.vtensor<[4,1,4096],f32>
    %7385 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %7386 = "torch.prims.convert_element_type"(%7384, %7385) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %7387 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %7388 = "torch.aten.add.Tensor"(%6987, %7386, %7387) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %7389 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %7390 = "torch.prims.convert_element_type"(%7388, %7389) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f32>
    %7391 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %7392 = "torch.aten.pow.Tensor_Scalar"(%7390, %7391) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],f32>
    %7393 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %7394 = "torch.prim.ListConstruct"(%7393) : (!torch.int) -> !torch.list<int>
    %7395 = "torch.constant.bool"() <{value = true}> : () -> !torch.bool
    %7396 = "torch.constant.none"() : () -> !torch.none
    %7397 = "torch.aten.mean.dim"(%7392, %7394, %7395, %7396) : (!torch.vtensor<[4,1,4096],f32>, !torch.list<int>, !torch.bool, !torch.none) -> !torch.vtensor<[4,1,1],f32>
    %7398 = "torch.constant.float"() <{value = 1.000000e-05 : f64}> : () -> !torch.float
    %7399 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %7400 = "torch.aten.add.Scalar"(%7397, %7398, %7399) : (!torch.vtensor<[4,1,1],f32>, !torch.float, !torch.int) -> !torch.vtensor<[4,1,1],f32>
    %7401 = "torch.aten.rsqrt"(%7400) : (!torch.vtensor<[4,1,1],f32>) -> !torch.vtensor<[4,1,1],f32>
    %7402 = "torch.aten.mul.Tensor"(%7390, %7401) : (!torch.vtensor<[4,1,4096],f32>, !torch.vtensor<[4,1,1],f32>) -> !torch.vtensor<[4,1,4096],f32>
    %7403 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %7404 = "torch.prims.convert_element_type"(%7402, %7403) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %7405 = "torch.aten.mul.Tensor"(%620, %7404) : (!torch.vtensor<[4096],bf16>, !torch.vtensor<[4,1,4096],bf16>) -> !torch.vtensor<[4,1,4096],bf16>
    %7406 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %7407 = "torch.prims.convert_element_type"(%7405, %7406) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %7408 = "torch.aten.div.Tensor"(%7407, %622) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %7409 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %7410 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %7411 = "torch.aten.clamp"(%7408, %7409, %7410) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %7412 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %7413 = "torch.prims.convert_element_type"(%7411, %7412) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %7414 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %7415 = "torch.aten.unsqueeze"(%624, %7414) : (!torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>
    %7416 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %7417 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %7418 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %7419 = "torch.prim.ListConstruct"(%7416, %7417, %7418) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7420 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %7421 = "torch.aten.expand"(%7415, %7419, %7420) : (!torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ>
    %7422 = "torch_c.to_builtin_tensor"(%7413) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %7423 = "torch_c.to_builtin_tensor"(%7421) : (!torch.vtensor<[4,14336,4096],f8E4M3FNUZ>) -> tensor<4x14336x4096xf8E4M3FNUZ>
    %7424 = "util.call"(%7422, %7423) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x1x14336xf32>
    %7425 = "torch_c.from_builtin_tensor"(%7424) : (tensor<4x1x14336xf32>) -> !torch.vtensor<[4,1,14336],f32>
    %7426 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %7427 = "torch.prims.convert_element_type"(%7425, %7426) : (!torch.vtensor<[4,1,14336],f32>, !torch.int) -> !torch.vtensor<[4,1,14336],bf16>
    %7428 = "torch.aten.silu"(%7427) : (!torch.vtensor<[4,1,14336],bf16>) -> !torch.vtensor<[4,1,14336],bf16>
    %7429 = "torch.aten.div.Tensor"(%7407, %626) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %7430 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %7431 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %7432 = "torch.aten.clamp"(%7429, %7430, %7431) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %7433 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %7434 = "torch.prims.convert_element_type"(%7432, %7433) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %7435 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %7436 = "torch.aten.unsqueeze"(%628, %7435) : (!torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>
    %7437 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %7438 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %7439 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %7440 = "torch.prim.ListConstruct"(%7437, %7438, %7439) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7441 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %7442 = "torch.aten.expand"(%7436, %7440, %7441) : (!torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ>
    %7443 = "torch_c.to_builtin_tensor"(%7434) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %7444 = "torch_c.to_builtin_tensor"(%7442) : (!torch.vtensor<[4,14336,4096],f8E4M3FNUZ>) -> tensor<4x14336x4096xf8E4M3FNUZ>
    %7445 = "util.call"(%7443, %7444) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x1x14336xf32>
    %7446 = "torch_c.from_builtin_tensor"(%7445) : (tensor<4x1x14336xf32>) -> !torch.vtensor<[4,1,14336],f32>
    %7447 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %7448 = "torch.prims.convert_element_type"(%7446, %7447) : (!torch.vtensor<[4,1,14336],f32>, !torch.int) -> !torch.vtensor<[4,1,14336],bf16>
    %7449 = "torch.aten.mul.Tensor"(%7428, %7448) : (!torch.vtensor<[4,1,14336],bf16>, !torch.vtensor<[4,1,14336],bf16>) -> !torch.vtensor<[4,1,14336],bf16>
    %7450 = "torch.aten.div.Tensor"(%7449, %630) : (!torch.vtensor<[4,1,14336],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,14336],bf16>
    %7451 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %7452 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %7453 = "torch.aten.clamp"(%7450, %7451, %7452) : (!torch.vtensor<[4,1,14336],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,14336],bf16>
    %7454 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %7455 = "torch.prims.convert_element_type"(%7453, %7454) : (!torch.vtensor<[4,1,14336],bf16>, !torch.int) -> !torch.vtensor<[4,1,14336],f8E4M3FNUZ>
    %7456 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %7457 = "torch.aten.unsqueeze"(%632, %7456) : (!torch.vtensor<[4096,14336],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,14336],f8E4M3FNUZ>
    %7458 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %7459 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %7460 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %7461 = "torch.prim.ListConstruct"(%7458, %7459, %7460) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7462 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %7463 = "torch.aten.expand"(%7457, %7461, %7462) : (!torch.vtensor<[1,4096,14336],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,14336],f8E4M3FNUZ>
    %7464 = "torch_c.to_builtin_tensor"(%7455) : (!torch.vtensor<[4,1,14336],f8E4M3FNUZ>) -> tensor<4x1x14336xf8E4M3FNUZ>
    %7465 = "torch_c.to_builtin_tensor"(%7463) : (!torch.vtensor<[4,4096,14336],f8E4M3FNUZ>) -> tensor<4x4096x14336xf8E4M3FNUZ>
    %7466 = "util.call"(%7464, %7465) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x14336xf8E4M3FNUZ_R4x4096x14336xf8E4M3FNUZ}> : (tensor<4x1x14336xf8E4M3FNUZ>, tensor<4x4096x14336xf8E4M3FNUZ>) -> tensor<4x1x4096xf32>
    %7467 = "torch_c.from_builtin_tensor"(%7466) : (tensor<4x1x4096xf32>) -> !torch.vtensor<[4,1,4096],f32>
    %7468 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %7469 = "torch.prims.convert_element_type"(%7467, %7468) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %7470 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %7471 = "torch.aten.add.Tensor"(%7388, %7469, %7470) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %7472 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %7473 = "torch.prims.convert_element_type"(%7471, %7472) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f32>
    %7474 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %7475 = "torch.aten.pow.Tensor_Scalar"(%7473, %7474) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],f32>
    %7476 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %7477 = "torch.prim.ListConstruct"(%7476) : (!torch.int) -> !torch.list<int>
    %7478 = "torch.constant.bool"() <{value = true}> : () -> !torch.bool
    %7479 = "torch.constant.none"() : () -> !torch.none
    %7480 = "torch.aten.mean.dim"(%7475, %7477, %7478, %7479) : (!torch.vtensor<[4,1,4096],f32>, !torch.list<int>, !torch.bool, !torch.none) -> !torch.vtensor<[4,1,1],f32>
    %7481 = "torch.constant.float"() <{value = 1.000000e-05 : f64}> : () -> !torch.float
    %7482 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %7483 = "torch.aten.add.Scalar"(%7480, %7481, %7482) : (!torch.vtensor<[4,1,1],f32>, !torch.float, !torch.int) -> !torch.vtensor<[4,1,1],f32>
    %7484 = "torch.aten.rsqrt"(%7483) : (!torch.vtensor<[4,1,1],f32>) -> !torch.vtensor<[4,1,1],f32>
    %7485 = "torch.aten.mul.Tensor"(%7473, %7484) : (!torch.vtensor<[4,1,4096],f32>, !torch.vtensor<[4,1,1],f32>) -> !torch.vtensor<[4,1,4096],f32>
    %7486 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %7487 = "torch.prims.convert_element_type"(%7485, %7486) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %7488 = "torch.aten.mul.Tensor"(%634, %7487) : (!torch.vtensor<[4096],bf16>, !torch.vtensor<[4,1,4096],bf16>) -> !torch.vtensor<[4,1,4096],bf16>
    %7489 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %7490 = "torch.prims.convert_element_type"(%7488, %7489) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %7491 = "torch.aten.div.Tensor"(%7490, %636) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %7492 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %7493 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %7494 = "torch.aten.clamp"(%7491, %7492, %7493) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %7495 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %7496 = "torch.prims.convert_element_type"(%7494, %7495) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %7497 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %7498 = "torch.aten.unsqueeze"(%638, %7497) : (!torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>
    %7499 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %7500 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %7501 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %7502 = "torch.prim.ListConstruct"(%7499, %7500, %7501) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7503 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %7504 = "torch.aten.expand"(%7498, %7502, %7503) : (!torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ>
    %7505 = "torch_c.to_builtin_tensor"(%7496) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %7506 = "torch_c.to_builtin_tensor"(%7504) : (!torch.vtensor<[4,4096,4096],f8E4M3FNUZ>) -> tensor<4x4096x4096xf8E4M3FNUZ>
    %7507 = "util.call"(%7505, %7506) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x1x4096xf32>
    %7508 = "torch_c.from_builtin_tensor"(%7507) : (tensor<4x1x4096xf32>) -> !torch.vtensor<[4,1,4096],f32>
    %7509 = "torch.aten.div.Tensor"(%7508, %640) : (!torch.vtensor<[4,1,4096],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],f32>
    %7510 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %7511 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %7512 = "torch.aten.clamp"(%7509, %7510, %7511) : (!torch.vtensor<[4,1,4096],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],f32>
    %7513 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %7514 = "torch.prims.convert_element_type"(%7512, %7513) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %7515 = "torch.aten.div.Tensor"(%7490, %642) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %7516 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %7517 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %7518 = "torch.aten.clamp"(%7515, %7516, %7517) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %7519 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %7520 = "torch.prims.convert_element_type"(%7518, %7519) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %7521 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %7522 = "torch.aten.unsqueeze"(%644, %7521) : (!torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>
    %7523 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %7524 = "torch.constant.int"() <{value = 1024 : i64}> : () -> !torch.int
    %7525 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %7526 = "torch.prim.ListConstruct"(%7523, %7524, %7525) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7527 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %7528 = "torch.aten.expand"(%7522, %7526, %7527) : (!torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ>
    %7529 = "torch_c.to_builtin_tensor"(%7520) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %7530 = "torch_c.to_builtin_tensor"(%7528) : (!torch.vtensor<[4,1024,4096],f8E4M3FNUZ>) -> tensor<4x1024x4096xf8E4M3FNUZ>
    %7531 = "util.call"(%7529, %7530) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x1x1024xf32>
    %7532 = "torch_c.from_builtin_tensor"(%7531) : (tensor<4x1x1024xf32>) -> !torch.vtensor<[4,1,1024],f32>
    %7533 = "torch.aten.div.Tensor"(%7532, %646) : (!torch.vtensor<[4,1,1024],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,1024],f32>
    %7534 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %7535 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %7536 = "torch.aten.clamp"(%7533, %7534, %7535) : (!torch.vtensor<[4,1,1024],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,1024],f32>
    %7537 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %7538 = "torch.prims.convert_element_type"(%7536, %7537) : (!torch.vtensor<[4,1,1024],f32>, !torch.int) -> !torch.vtensor<[4,1,1024],f8E4M3FNUZ>
    %7539 = "torch.aten.div.Tensor"(%7490, %648) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %7540 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %7541 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %7542 = "torch.aten.clamp"(%7539, %7540, %7541) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %7543 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %7544 = "torch.prims.convert_element_type"(%7542, %7543) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %7545 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %7546 = "torch.aten.unsqueeze"(%650, %7545) : (!torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>
    %7547 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %7548 = "torch.constant.int"() <{value = 1024 : i64}> : () -> !torch.int
    %7549 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %7550 = "torch.prim.ListConstruct"(%7547, %7548, %7549) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7551 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %7552 = "torch.aten.expand"(%7546, %7550, %7551) : (!torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ>
    %7553 = "torch_c.to_builtin_tensor"(%7544) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %7554 = "torch_c.to_builtin_tensor"(%7552) : (!torch.vtensor<[4,1024,4096],f8E4M3FNUZ>) -> tensor<4x1024x4096xf8E4M3FNUZ>
    %7555 = "util.call"(%7553, %7554) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x1x1024xf32>
    %7556 = "torch_c.from_builtin_tensor"(%7555) : (tensor<4x1x1024xf32>) -> !torch.vtensor<[4,1,1024],f32>
    %7557 = "torch.aten.div.Tensor"(%7556, %652) : (!torch.vtensor<[4,1,1024],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,1024],f32>
    %7558 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %7559 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %7560 = "torch.aten.clamp"(%7557, %7558, %7559) : (!torch.vtensor<[4,1,1024],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,1024],f32>
    %7561 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %7562 = "torch.prims.convert_element_type"(%7560, %7561) : (!torch.vtensor<[4,1,1024],f32>, !torch.int) -> !torch.vtensor<[4,1,1024],f8E4M3FNUZ>
    %7563 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %7564 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %7565 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %7566 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %7567 = "torch.prim.ListConstruct"(%7563, %7564, %7565, %7566) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7568 = "torch.aten.view"(%7514, %7567) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,1,32,128],f8E4M3FNUZ>
    %7569 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %7570 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %7571 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %7572 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %7573 = "torch.prim.ListConstruct"(%7569, %7570, %7571, %7572) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7574 = "torch.aten.view"(%7538, %7573) : (!torch.vtensor<[4,1,1024],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>
    %7575 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %7576 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %7577 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %7578 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %7579 = "torch.prim.ListConstruct"(%7575, %7576, %7577, %7578) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7580 = "torch.aten.view"(%7562, %7579) : (!torch.vtensor<[4,1,1024],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>
    %7581 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %7582 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %7583 = "torch.aten.transpose.int"(%7568, %7581, %7582) : (!torch.vtensor<[4,1,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %7584 = "torch.aten.mul.Tensor"(%7583, %1637) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,1,1,128],bf16>) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %7585 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %7586 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %7587 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %7588 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %7589 = "torch.aten.slice.Tensor"(%7583, %7585, %7586, %7587, %7588) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,32,1,64],f8E4M3FNUZ>
    %7590 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %7591 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %7592 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %7593 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %7594 = "torch.aten.slice.Tensor"(%7583, %7590, %7591, %7592, %7593) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,32,1,64],f8E4M3FNUZ>
    %7595 = "torch.aten.neg"(%7594) : (!torch.vtensor<[4,32,1,64],f8E4M3FNUZ>) -> !torch.vtensor<[4,32,1,64],f8E4M3FNUZ>
    %7596 = "torch.prim.ListConstruct"(%7595, %7589) : (!torch.vtensor<[4,32,1,64],f8E4M3FNUZ>, !torch.vtensor<[4,32,1,64],f8E4M3FNUZ>) -> !torch.list<vtensor>
    %7597 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %7598 = "torch.aten.cat"(%7596, %7597) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %7599 = "torch.aten.mul.Tensor"(%7598, %1651) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,1,1,128],bf16>) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %7600 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %7601 = "torch.aten.add.Tensor"(%7584, %7599, %7600) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %7602 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %7603 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %7604 = "torch.aten.transpose.int"(%7601, %7602, %7603) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,1,32,128],f8E4M3FNUZ>
    %7605 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %7606 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %7607 = "torch.aten.transpose.int"(%7574, %7605, %7606) : (!torch.vtensor<[4,1,8,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %7608 = "torch.aten.mul.Tensor"(%7607, %1637) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,1,1,128],bf16>) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %7609 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %7610 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %7611 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %7612 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %7613 = "torch.aten.slice.Tensor"(%7607, %7609, %7610, %7611, %7612) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,8,1,64],f8E4M3FNUZ>
    %7614 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %7615 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %7616 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %7617 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %7618 = "torch.aten.slice.Tensor"(%7607, %7614, %7615, %7616, %7617) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,8,1,64],f8E4M3FNUZ>
    %7619 = "torch.aten.neg"(%7618) : (!torch.vtensor<[4,8,1,64],f8E4M3FNUZ>) -> !torch.vtensor<[4,8,1,64],f8E4M3FNUZ>
    %7620 = "torch.prim.ListConstruct"(%7619, %7613) : (!torch.vtensor<[4,8,1,64],f8E4M3FNUZ>, !torch.vtensor<[4,8,1,64],f8E4M3FNUZ>) -> !torch.list<vtensor>
    %7621 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %7622 = "torch.aten.cat"(%7620, %7621) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %7623 = "torch.aten.mul.Tensor"(%7622, %1651) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,1,1,128],bf16>) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %7624 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %7625 = "torch.aten.add.Tensor"(%7608, %7623, %7624) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %7626 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %7627 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %7628 = "torch.aten.transpose.int"(%7625, %7626, %7627) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>
    %7629 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %7630 = "torch.aten.floor_divide.Scalar"(%arg64, %7629) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4],si64>
    %7631 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %7632 = "torch.aten.unsqueeze"(%7630, %7631) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %7633 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %7634 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %7635 = "torch.aten.gather"(%arg65, %7633, %7632, %7634) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.vtensor<[4,1],si64>, !torch.bool) -> !torch.vtensor<[4,1],si64>
    %7636 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %7637 = "torch.aten.remainder.Scalar"(%arg64, %7636) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4],si64>
    %7638 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %7639 = "torch.aten.unsqueeze"(%7637, %7638) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %7640 = "torch.constant.none"() : () -> !torch.none
    %7641 = "torch.aten.clone"(%653, %7640) : (!torch.vtensor<[],si64>, !torch.none) -> !torch.vtensor<[],si64>
    %7642 = "torch.aten.detach"(%7641) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %7643 = "torch.aten.detach"(%7642) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %7644 = "torch.aten.detach"(%7643) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %7645 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %7646 = "torch.aten.unsqueeze"(%7644, %7645) : (!torch.vtensor<[],si64>, !torch.int) -> !torch.vtensor<[1],si64>
    %7647 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %7648 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %7649 = "torch.prim.ListConstruct"(%7647, %7648) : (!torch.int, !torch.int) -> !torch.list<int>
    %7650 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %7651 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %7652 = "torch.prim.ListConstruct"(%7650, %7651) : (!torch.int, !torch.int) -> !torch.list<int>
    %7653 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %7654 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %7655 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %7656 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %7657 = "torch.aten.empty_strided"(%7649, %7652, %7653, %7654, %7655, %7656) : (!torch.list<int>, !torch.list<int>, !torch.int, !torch.int, !torch.Device, !torch.bool) -> !torch.vtensor<[4,1],si64>
    %7658 = "torch.constant.int"() <{value = 12 : i64}> : () -> !torch.int
    %7659 = "torch.aten.fill.Scalar"(%7657, %7658) : (!torch.vtensor<[4,1],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %7660 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %7661 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %7662 = "torch.prim.ListConstruct"(%7660, %7661) : (!torch.int, !torch.int) -> !torch.list<int>
    %7663 = "torch.aten.repeat"(%7646, %7662) : (!torch.vtensor<[1],si64>, !torch.list<int>) -> !torch.vtensor<[4,1],si64>
    %7664 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %7665 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %7666 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %7667 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %7668 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %7669 = "torch.prim.ListConstruct"(%1483, %7664, %7665, %7666, %7667, %7668) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7670 = "torch.aten.view"(%7240, %7669) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%7670, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %7671 = "torch.prim.ListConstruct"(%7635, %7659, %7663, %7639) : (!torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>) -> !torch.list<optional<vtensor>>
    %7672 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %7673 = "torch.aten.index_put"(%7670, %7671, %7628, %7672) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>, !torch.bool) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%7673, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %7674 = "torch.constant.int"() <{value = 2097152 : i64}> : () -> !torch.int
    %7675 = "torch.prim.ListConstruct"(%1483, %7674) : (!torch.int, !torch.int) -> !torch.list<int>
    %7676 = "torch.aten.view"(%7673, %7675) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,2097152],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%7676, %1479) <{shape_expressions = #map2}> : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> ()
    %7677 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %7678 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %7679 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %7680 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %7681 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %7682 = "torch.prim.ListConstruct"(%1483, %7677, %7678, %7679, %7680, %7681) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7683 = "torch.aten.view"(%7676, %7682) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%7683, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %7684 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %7685 = "torch.aten.floor_divide.Scalar"(%arg64, %7684) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4],si64>
    %7686 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %7687 = "torch.aten.unsqueeze"(%7685, %7686) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %7688 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %7689 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %7690 = "torch.aten.gather"(%arg65, %7688, %7687, %7689) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.vtensor<[4,1],si64>, !torch.bool) -> !torch.vtensor<[4,1],si64>
    %7691 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %7692 = "torch.aten.remainder.Scalar"(%arg64, %7691) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4],si64>
    %7693 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %7694 = "torch.aten.unsqueeze"(%7692, %7693) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %7695 = "torch.constant.none"() : () -> !torch.none
    %7696 = "torch.aten.clone"(%654, %7695) : (!torch.vtensor<[],si64>, !torch.none) -> !torch.vtensor<[],si64>
    %7697 = "torch.aten.detach"(%7696) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %7698 = "torch.aten.detach"(%7697) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %7699 = "torch.aten.detach"(%7698) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %7700 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %7701 = "torch.aten.unsqueeze"(%7699, %7700) : (!torch.vtensor<[],si64>, !torch.int) -> !torch.vtensor<[1],si64>
    %7702 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %7703 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %7704 = "torch.prim.ListConstruct"(%7702, %7703) : (!torch.int, !torch.int) -> !torch.list<int>
    %7705 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %7706 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %7707 = "torch.prim.ListConstruct"(%7705, %7706) : (!torch.int, !torch.int) -> !torch.list<int>
    %7708 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %7709 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %7710 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %7711 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %7712 = "torch.aten.empty_strided"(%7704, %7707, %7708, %7709, %7710, %7711) : (!torch.list<int>, !torch.list<int>, !torch.int, !torch.int, !torch.Device, !torch.bool) -> !torch.vtensor<[4,1],si64>
    %7713 = "torch.constant.int"() <{value = 12 : i64}> : () -> !torch.int
    %7714 = "torch.aten.fill.Scalar"(%7712, %7713) : (!torch.vtensor<[4,1],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %7715 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %7716 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %7717 = "torch.prim.ListConstruct"(%7715, %7716) : (!torch.int, !torch.int) -> !torch.list<int>
    %7718 = "torch.aten.repeat"(%7701, %7717) : (!torch.vtensor<[1],si64>, !torch.list<int>) -> !torch.vtensor<[4,1],si64>
    %7719 = "torch.prim.ListConstruct"(%7690, %7714, %7718, %7694) : (!torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>) -> !torch.list<optional<vtensor>>
    %7720 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %7721 = "torch.aten.index_put"(%7683, %7719, %7580, %7720) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>, !torch.bool) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%7721, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %7722 = "torch.constant.int"() <{value = 2097152 : i64}> : () -> !torch.int
    %7723 = "torch.prim.ListConstruct"(%1483, %7722) : (!torch.int, !torch.int) -> !torch.list<int>
    %7724 = "torch.aten.view"(%7721, %7723) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,2097152],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%7724, %1479) <{shape_expressions = #map2}> : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> ()
    %7725 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %7726 = "torch.aten.mul.Scalar"(%arg65, %7725) : (!torch.vtensor<[4,?],si64>, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%7726, %1478) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %7727 = "torch.constant.int"() <{value = 24 : i64}> : () -> !torch.int
    %7728 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %7729 = "torch.aten.add.Scalar"(%7726, %7727, %7728) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%7729, %1478) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %7730 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %7731 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %7732 = "torch.aten.add.Scalar"(%7729, %7730, %7731) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%7732, %1478) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %7733 = "torch.prim.ListConstruct"(%1924) : (!torch.int) -> !torch.list<int>
    %7734 = "torch.aten.view"(%7732, %7733) : (!torch.vtensor<[4,?],si64>, !torch.list<int>) -> !torch.vtensor<[?],si64>
    "torch.bind_symbolic_shape"(%7734, %1478) <{shape_expressions = #map24}> : (!torch.vtensor<[?],si64>, !torch.int) -> ()
    %7735 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %7736 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %7737 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %7738 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %7739 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %7740 = "torch.prim.ListConstruct"(%1483, %7735, %7736, %7737, %7738, %7739) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7741 = "torch.aten.view"(%7724, %7740) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%7741, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %7742 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %7743 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %7744 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %7745 = "torch.prim.ListConstruct"(%1914, %7742, %7743, %7744) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7746 = "torch.aten.view"(%7741, %7745) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%7746, %1479) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %7747 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %7748 = "torch.aten.index_select"(%7746, %7747, %7734) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int, !torch.vtensor<[?],si64>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%7748, %1478) <{shape_expressions = #map23}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %7749 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %7750 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %7751 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %7752 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %7753 = "torch.prim.ListConstruct"(%7749, %1481, %7750, %7751, %7752) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7754 = "torch.aten.view"(%7748, %7753) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%7754, %1478) <{shape_expressions = #map22}> : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %7755 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %7756 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %7757 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %7758 = "torch.prim.ListConstruct"(%7755, %1485, %7756, %7757) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7759 = "torch.aten.view"(%7754, %7758) : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%7759, %1478) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %7760 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %7761 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %7762 = "torch.aten.add.Scalar"(%7729, %7760, %7761) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%7762, %1478) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %7763 = "torch.prim.ListConstruct"(%1924) : (!torch.int) -> !torch.list<int>
    %7764 = "torch.aten.view"(%7762, %7763) : (!torch.vtensor<[4,?],si64>, !torch.list<int>) -> !torch.vtensor<[?],si64>
    "torch.bind_symbolic_shape"(%7764, %1478) <{shape_expressions = #map24}> : (!torch.vtensor<[?],si64>, !torch.int) -> ()
    %7765 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %7766 = "torch.aten.index_select"(%7746, %7765, %7764) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int, !torch.vtensor<[?],si64>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%7766, %1478) <{shape_expressions = #map23}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %7767 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %7768 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %7769 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %7770 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %7771 = "torch.prim.ListConstruct"(%7767, %1481, %7768, %7769, %7770) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7772 = "torch.aten.view"(%7766, %7771) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%7772, %1478) <{shape_expressions = #map22}> : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %7773 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %7774 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %7775 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %7776 = "torch.prim.ListConstruct"(%7773, %1485, %7774, %7775) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7777 = "torch.aten.view"(%7772, %7776) : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%7777, %1478) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %7778 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %7779 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %7780 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %7781 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %7782 = "torch.aten.slice.Tensor"(%7759, %7778, %7779, %7780, %7781) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%7782, %1478) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %7783 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %7784 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %7785 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %7786 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %7787 = "torch.aten.slice.Tensor"(%7777, %7783, %7784, %7785, %7786) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%7787, %1478) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %7788 = "torch.constant.int"() <{value = -2 : i64}> : () -> !torch.int
    %7789 = "torch.aten.unsqueeze"(%7782, %7788) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%7789, %1478) <{shape_expressions = #map25}> : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.int) -> ()
    %7790 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %7791 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %7792 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %7793 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %7794 = "torch.prim.ListConstruct"(%7790, %1485, %7791, %7792, %7793) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7795 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %7796 = "torch.aten.expand"(%7789, %7794, %7795) : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%7796, %1478) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %7797 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %7798 = "torch.aten.clone"(%7796, %7797) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%7798, %1478) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %7799 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %7800 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %7801 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %7802 = "torch.prim.ListConstruct"(%7799, %1485, %7800, %7801) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7803 = "torch.aten._unsafe_view"(%7798, %7802) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%7803, %1478) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %7804 = "torch.constant.int"() <{value = -2 : i64}> : () -> !torch.int
    %7805 = "torch.aten.unsqueeze"(%7787, %7804) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%7805, %1478) <{shape_expressions = #map25}> : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.int) -> ()
    %7806 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %7807 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %7808 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %7809 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %7810 = "torch.prim.ListConstruct"(%7806, %1485, %7807, %7808, %7809) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7811 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %7812 = "torch.aten.expand"(%7805, %7810, %7811) : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%7812, %1478) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %7813 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %7814 = "torch.aten.clone"(%7812, %7813) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%7814, %1478) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %7815 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %7816 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %7817 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %7818 = "torch.prim.ListConstruct"(%7815, %1485, %7816, %7817) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7819 = "torch.aten._unsafe_view"(%7814, %7818) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%7819, %1478) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %7820 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %7821 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %7822 = "torch.aten.transpose.int"(%7604, %7820, %7821) : (!torch.vtensor<[4,1,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %7823 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %7824 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %7825 = "torch.aten.transpose.int"(%7803, %7823, %7824) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%7825, %1478) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %7826 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %7827 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %7828 = "torch.aten.transpose.int"(%7819, %7826, %7827) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%7828, %1478) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %7829 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %7830 = "torch.aten.squeeze.dim"(%1516, %7829) : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,1,1,?],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%7830, %1478) <{shape_expressions = #map7}> : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>, !torch.int) -> ()
    %7831 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %7832 = "torch.aten.squeeze.dim"(%7830, %7831) : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,1,1,?],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%7832, %1478) <{shape_expressions = #map7}> : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>, !torch.int) -> ()
    %7833 = "torch_c.to_builtin_tensor"(%7822) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>) -> tensor<4x32x1x128xf8E4M3FNUZ>
    %7834 = "tensor.cast"(%7833) : (tensor<4x32x1x128xf8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %7835 = "torch_c.to_builtin_tensor"(%7825) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %7836 = "torch_c.to_builtin_tensor"(%7828) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %7837 = "torch_c.to_builtin_tensor"(%7832) : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>) -> tensor<4x1x1x?xf8E4M3FNUZ>
    %7838 = "tensor.cast"(%7837) : (tensor<4x1x1x?xf8E4M3FNUZ>) -> tensor<?x?x?x?xf8E4M3FNUZ>
    %7839 = "torch_c.to_builtin_tensor"(%656) : (!torch.vtensor<[],f32>) -> tensor<f32>
    %7840 = "util.call"(%7834, %7835, %7836, %7839, %7838) <{callee = @sharktank_masked_flash_attention_4_32_128_128_f8E4M3FNUZ_f32_f32}> : (tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<f32>, tensor<?x?x?x?xf8E4M3FNUZ>) -> tensor<4x32x?x128xf32>
    %7841 = "tensor.cast"(%7840) : (tensor<4x32x?x128xf32>) -> tensor<4x32x1x128xf32>
    %7842 = "torch_c.from_builtin_tensor"(%7841) : (tensor<4x32x1x128xf32>) -> !torch.vtensor<[4,32,1,128],f32>
    %7843 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %7844 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %7845 = "torch.aten.transpose.int"(%7842, %7843, %7844) : (!torch.vtensor<[4,32,1,128],f32>, !torch.int, !torch.int) -> !torch.vtensor<[4,1,32,128],f32>
    %7846 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %7847 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %7848 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %7849 = "torch.prim.ListConstruct"(%7846, %7847, %7848) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7850 = "torch.aten.view"(%7845, %7849) : (!torch.vtensor<[4,1,32,128],f32>, !torch.list<int>) -> !torch.vtensor<[4,1,4096],f32>
    %7851 = "torch.aten.div.Tensor"(%7850, %658) : (!torch.vtensor<[4,1,4096],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],f32>
    %7852 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %7853 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %7854 = "torch.aten.clamp"(%7851, %7852, %7853) : (!torch.vtensor<[4,1,4096],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],f32>
    %7855 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %7856 = "torch.prims.convert_element_type"(%7854, %7855) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %7857 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %7858 = "torch.aten.unsqueeze"(%660, %7857) : (!torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>
    %7859 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %7860 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %7861 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %7862 = "torch.prim.ListConstruct"(%7859, %7860, %7861) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7863 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %7864 = "torch.aten.expand"(%7858, %7862, %7863) : (!torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ>
    %7865 = "torch_c.to_builtin_tensor"(%7856) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %7866 = "torch_c.to_builtin_tensor"(%7864) : (!torch.vtensor<[4,4096,4096],f8E4M3FNUZ>) -> tensor<4x4096x4096xf8E4M3FNUZ>
    %7867 = "util.call"(%7865, %7866) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x1x4096xf32>
    %7868 = "torch_c.from_builtin_tensor"(%7867) : (tensor<4x1x4096xf32>) -> !torch.vtensor<[4,1,4096],f32>
    %7869 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %7870 = "torch.prims.convert_element_type"(%7868, %7869) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %7871 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %7872 = "torch.aten.add.Tensor"(%7471, %7870, %7871) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %7873 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %7874 = "torch.prims.convert_element_type"(%7872, %7873) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f32>
    %7875 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %7876 = "torch.aten.pow.Tensor_Scalar"(%7874, %7875) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],f32>
    %7877 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %7878 = "torch.prim.ListConstruct"(%7877) : (!torch.int) -> !torch.list<int>
    %7879 = "torch.constant.bool"() <{value = true}> : () -> !torch.bool
    %7880 = "torch.constant.none"() : () -> !torch.none
    %7881 = "torch.aten.mean.dim"(%7876, %7878, %7879, %7880) : (!torch.vtensor<[4,1,4096],f32>, !torch.list<int>, !torch.bool, !torch.none) -> !torch.vtensor<[4,1,1],f32>
    %7882 = "torch.constant.float"() <{value = 1.000000e-05 : f64}> : () -> !torch.float
    %7883 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %7884 = "torch.aten.add.Scalar"(%7881, %7882, %7883) : (!torch.vtensor<[4,1,1],f32>, !torch.float, !torch.int) -> !torch.vtensor<[4,1,1],f32>
    %7885 = "torch.aten.rsqrt"(%7884) : (!torch.vtensor<[4,1,1],f32>) -> !torch.vtensor<[4,1,1],f32>
    %7886 = "torch.aten.mul.Tensor"(%7874, %7885) : (!torch.vtensor<[4,1,4096],f32>, !torch.vtensor<[4,1,1],f32>) -> !torch.vtensor<[4,1,4096],f32>
    %7887 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %7888 = "torch.prims.convert_element_type"(%7886, %7887) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %7889 = "torch.aten.mul.Tensor"(%662, %7888) : (!torch.vtensor<[4096],bf16>, !torch.vtensor<[4,1,4096],bf16>) -> !torch.vtensor<[4,1,4096],bf16>
    %7890 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %7891 = "torch.prims.convert_element_type"(%7889, %7890) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %7892 = "torch.aten.div.Tensor"(%7891, %664) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %7893 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %7894 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %7895 = "torch.aten.clamp"(%7892, %7893, %7894) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %7896 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %7897 = "torch.prims.convert_element_type"(%7895, %7896) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %7898 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %7899 = "torch.aten.unsqueeze"(%666, %7898) : (!torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>
    %7900 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %7901 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %7902 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %7903 = "torch.prim.ListConstruct"(%7900, %7901, %7902) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7904 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %7905 = "torch.aten.expand"(%7899, %7903, %7904) : (!torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ>
    %7906 = "torch_c.to_builtin_tensor"(%7897) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %7907 = "torch_c.to_builtin_tensor"(%7905) : (!torch.vtensor<[4,14336,4096],f8E4M3FNUZ>) -> tensor<4x14336x4096xf8E4M3FNUZ>
    %7908 = "util.call"(%7906, %7907) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x1x14336xf32>
    %7909 = "torch_c.from_builtin_tensor"(%7908) : (tensor<4x1x14336xf32>) -> !torch.vtensor<[4,1,14336],f32>
    %7910 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %7911 = "torch.prims.convert_element_type"(%7909, %7910) : (!torch.vtensor<[4,1,14336],f32>, !torch.int) -> !torch.vtensor<[4,1,14336],bf16>
    %7912 = "torch.aten.silu"(%7911) : (!torch.vtensor<[4,1,14336],bf16>) -> !torch.vtensor<[4,1,14336],bf16>
    %7913 = "torch.aten.div.Tensor"(%7891, %668) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %7914 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %7915 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %7916 = "torch.aten.clamp"(%7913, %7914, %7915) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %7917 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %7918 = "torch.prims.convert_element_type"(%7916, %7917) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %7919 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %7920 = "torch.aten.unsqueeze"(%670, %7919) : (!torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>
    %7921 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %7922 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %7923 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %7924 = "torch.prim.ListConstruct"(%7921, %7922, %7923) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7925 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %7926 = "torch.aten.expand"(%7920, %7924, %7925) : (!torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ>
    %7927 = "torch_c.to_builtin_tensor"(%7918) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %7928 = "torch_c.to_builtin_tensor"(%7926) : (!torch.vtensor<[4,14336,4096],f8E4M3FNUZ>) -> tensor<4x14336x4096xf8E4M3FNUZ>
    %7929 = "util.call"(%7927, %7928) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x1x14336xf32>
    %7930 = "torch_c.from_builtin_tensor"(%7929) : (tensor<4x1x14336xf32>) -> !torch.vtensor<[4,1,14336],f32>
    %7931 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %7932 = "torch.prims.convert_element_type"(%7930, %7931) : (!torch.vtensor<[4,1,14336],f32>, !torch.int) -> !torch.vtensor<[4,1,14336],bf16>
    %7933 = "torch.aten.mul.Tensor"(%7912, %7932) : (!torch.vtensor<[4,1,14336],bf16>, !torch.vtensor<[4,1,14336],bf16>) -> !torch.vtensor<[4,1,14336],bf16>
    %7934 = "torch.aten.div.Tensor"(%7933, %672) : (!torch.vtensor<[4,1,14336],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,14336],bf16>
    %7935 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %7936 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %7937 = "torch.aten.clamp"(%7934, %7935, %7936) : (!torch.vtensor<[4,1,14336],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,14336],bf16>
    %7938 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %7939 = "torch.prims.convert_element_type"(%7937, %7938) : (!torch.vtensor<[4,1,14336],bf16>, !torch.int) -> !torch.vtensor<[4,1,14336],f8E4M3FNUZ>
    %7940 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %7941 = "torch.aten.unsqueeze"(%674, %7940) : (!torch.vtensor<[4096,14336],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,14336],f8E4M3FNUZ>
    %7942 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %7943 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %7944 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %7945 = "torch.prim.ListConstruct"(%7942, %7943, %7944) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7946 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %7947 = "torch.aten.expand"(%7941, %7945, %7946) : (!torch.vtensor<[1,4096,14336],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,14336],f8E4M3FNUZ>
    %7948 = "torch_c.to_builtin_tensor"(%7939) : (!torch.vtensor<[4,1,14336],f8E4M3FNUZ>) -> tensor<4x1x14336xf8E4M3FNUZ>
    %7949 = "torch_c.to_builtin_tensor"(%7947) : (!torch.vtensor<[4,4096,14336],f8E4M3FNUZ>) -> tensor<4x4096x14336xf8E4M3FNUZ>
    %7950 = "util.call"(%7948, %7949) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x14336xf8E4M3FNUZ_R4x4096x14336xf8E4M3FNUZ}> : (tensor<4x1x14336xf8E4M3FNUZ>, tensor<4x4096x14336xf8E4M3FNUZ>) -> tensor<4x1x4096xf32>
    %7951 = "torch_c.from_builtin_tensor"(%7950) : (tensor<4x1x4096xf32>) -> !torch.vtensor<[4,1,4096],f32>
    %7952 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %7953 = "torch.prims.convert_element_type"(%7951, %7952) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %7954 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %7955 = "torch.aten.add.Tensor"(%7872, %7953, %7954) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %7956 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %7957 = "torch.prims.convert_element_type"(%7955, %7956) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f32>
    %7958 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %7959 = "torch.aten.pow.Tensor_Scalar"(%7957, %7958) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],f32>
    %7960 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %7961 = "torch.prim.ListConstruct"(%7960) : (!torch.int) -> !torch.list<int>
    %7962 = "torch.constant.bool"() <{value = true}> : () -> !torch.bool
    %7963 = "torch.constant.none"() : () -> !torch.none
    %7964 = "torch.aten.mean.dim"(%7959, %7961, %7962, %7963) : (!torch.vtensor<[4,1,4096],f32>, !torch.list<int>, !torch.bool, !torch.none) -> !torch.vtensor<[4,1,1],f32>
    %7965 = "torch.constant.float"() <{value = 1.000000e-05 : f64}> : () -> !torch.float
    %7966 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %7967 = "torch.aten.add.Scalar"(%7964, %7965, %7966) : (!torch.vtensor<[4,1,1],f32>, !torch.float, !torch.int) -> !torch.vtensor<[4,1,1],f32>
    %7968 = "torch.aten.rsqrt"(%7967) : (!torch.vtensor<[4,1,1],f32>) -> !torch.vtensor<[4,1,1],f32>
    %7969 = "torch.aten.mul.Tensor"(%7957, %7968) : (!torch.vtensor<[4,1,4096],f32>, !torch.vtensor<[4,1,1],f32>) -> !torch.vtensor<[4,1,4096],f32>
    %7970 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %7971 = "torch.prims.convert_element_type"(%7969, %7970) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %7972 = "torch.aten.mul.Tensor"(%676, %7971) : (!torch.vtensor<[4096],bf16>, !torch.vtensor<[4,1,4096],bf16>) -> !torch.vtensor<[4,1,4096],bf16>
    %7973 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %7974 = "torch.prims.convert_element_type"(%7972, %7973) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %7975 = "torch.aten.div.Tensor"(%7974, %678) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %7976 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %7977 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %7978 = "torch.aten.clamp"(%7975, %7976, %7977) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %7979 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %7980 = "torch.prims.convert_element_type"(%7978, %7979) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %7981 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %7982 = "torch.aten.unsqueeze"(%680, %7981) : (!torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>
    %7983 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %7984 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %7985 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %7986 = "torch.prim.ListConstruct"(%7983, %7984, %7985) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7987 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %7988 = "torch.aten.expand"(%7982, %7986, %7987) : (!torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ>
    %7989 = "torch_c.to_builtin_tensor"(%7980) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %7990 = "torch_c.to_builtin_tensor"(%7988) : (!torch.vtensor<[4,4096,4096],f8E4M3FNUZ>) -> tensor<4x4096x4096xf8E4M3FNUZ>
    %7991 = "util.call"(%7989, %7990) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x1x4096xf32>
    %7992 = "torch_c.from_builtin_tensor"(%7991) : (tensor<4x1x4096xf32>) -> !torch.vtensor<[4,1,4096],f32>
    %7993 = "torch.aten.div.Tensor"(%7992, %682) : (!torch.vtensor<[4,1,4096],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],f32>
    %7994 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %7995 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %7996 = "torch.aten.clamp"(%7993, %7994, %7995) : (!torch.vtensor<[4,1,4096],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],f32>
    %7997 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %7998 = "torch.prims.convert_element_type"(%7996, %7997) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %7999 = "torch.aten.div.Tensor"(%7974, %684) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %8000 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %8001 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %8002 = "torch.aten.clamp"(%7999, %8000, %8001) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %8003 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %8004 = "torch.prims.convert_element_type"(%8002, %8003) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %8005 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %8006 = "torch.aten.unsqueeze"(%686, %8005) : (!torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>
    %8007 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %8008 = "torch.constant.int"() <{value = 1024 : i64}> : () -> !torch.int
    %8009 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %8010 = "torch.prim.ListConstruct"(%8007, %8008, %8009) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8011 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %8012 = "torch.aten.expand"(%8006, %8010, %8011) : (!torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ>
    %8013 = "torch_c.to_builtin_tensor"(%8004) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %8014 = "torch_c.to_builtin_tensor"(%8012) : (!torch.vtensor<[4,1024,4096],f8E4M3FNUZ>) -> tensor<4x1024x4096xf8E4M3FNUZ>
    %8015 = "util.call"(%8013, %8014) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x1x1024xf32>
    %8016 = "torch_c.from_builtin_tensor"(%8015) : (tensor<4x1x1024xf32>) -> !torch.vtensor<[4,1,1024],f32>
    %8017 = "torch.aten.div.Tensor"(%8016, %688) : (!torch.vtensor<[4,1,1024],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,1024],f32>
    %8018 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %8019 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %8020 = "torch.aten.clamp"(%8017, %8018, %8019) : (!torch.vtensor<[4,1,1024],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,1024],f32>
    %8021 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %8022 = "torch.prims.convert_element_type"(%8020, %8021) : (!torch.vtensor<[4,1,1024],f32>, !torch.int) -> !torch.vtensor<[4,1,1024],f8E4M3FNUZ>
    %8023 = "torch.aten.div.Tensor"(%7974, %690) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %8024 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %8025 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %8026 = "torch.aten.clamp"(%8023, %8024, %8025) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %8027 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %8028 = "torch.prims.convert_element_type"(%8026, %8027) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %8029 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %8030 = "torch.aten.unsqueeze"(%692, %8029) : (!torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>
    %8031 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %8032 = "torch.constant.int"() <{value = 1024 : i64}> : () -> !torch.int
    %8033 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %8034 = "torch.prim.ListConstruct"(%8031, %8032, %8033) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8035 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %8036 = "torch.aten.expand"(%8030, %8034, %8035) : (!torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ>
    %8037 = "torch_c.to_builtin_tensor"(%8028) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %8038 = "torch_c.to_builtin_tensor"(%8036) : (!torch.vtensor<[4,1024,4096],f8E4M3FNUZ>) -> tensor<4x1024x4096xf8E4M3FNUZ>
    %8039 = "util.call"(%8037, %8038) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x1x1024xf32>
    %8040 = "torch_c.from_builtin_tensor"(%8039) : (tensor<4x1x1024xf32>) -> !torch.vtensor<[4,1,1024],f32>
    %8041 = "torch.aten.div.Tensor"(%8040, %694) : (!torch.vtensor<[4,1,1024],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,1024],f32>
    %8042 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %8043 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %8044 = "torch.aten.clamp"(%8041, %8042, %8043) : (!torch.vtensor<[4,1,1024],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,1024],f32>
    %8045 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %8046 = "torch.prims.convert_element_type"(%8044, %8045) : (!torch.vtensor<[4,1,1024],f32>, !torch.int) -> !torch.vtensor<[4,1,1024],f8E4M3FNUZ>
    %8047 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %8048 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %8049 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %8050 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %8051 = "torch.prim.ListConstruct"(%8047, %8048, %8049, %8050) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8052 = "torch.aten.view"(%7998, %8051) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,1,32,128],f8E4M3FNUZ>
    %8053 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %8054 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %8055 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %8056 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %8057 = "torch.prim.ListConstruct"(%8053, %8054, %8055, %8056) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8058 = "torch.aten.view"(%8022, %8057) : (!torch.vtensor<[4,1,1024],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>
    %8059 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %8060 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %8061 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %8062 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %8063 = "torch.prim.ListConstruct"(%8059, %8060, %8061, %8062) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8064 = "torch.aten.view"(%8046, %8063) : (!torch.vtensor<[4,1,1024],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>
    %8065 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %8066 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %8067 = "torch.aten.transpose.int"(%8052, %8065, %8066) : (!torch.vtensor<[4,1,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %8068 = "torch.aten.mul.Tensor"(%8067, %1637) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,1,1,128],bf16>) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %8069 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %8070 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %8071 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %8072 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %8073 = "torch.aten.slice.Tensor"(%8067, %8069, %8070, %8071, %8072) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,32,1,64],f8E4M3FNUZ>
    %8074 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %8075 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %8076 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %8077 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %8078 = "torch.aten.slice.Tensor"(%8067, %8074, %8075, %8076, %8077) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,32,1,64],f8E4M3FNUZ>
    %8079 = "torch.aten.neg"(%8078) : (!torch.vtensor<[4,32,1,64],f8E4M3FNUZ>) -> !torch.vtensor<[4,32,1,64],f8E4M3FNUZ>
    %8080 = "torch.prim.ListConstruct"(%8079, %8073) : (!torch.vtensor<[4,32,1,64],f8E4M3FNUZ>, !torch.vtensor<[4,32,1,64],f8E4M3FNUZ>) -> !torch.list<vtensor>
    %8081 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %8082 = "torch.aten.cat"(%8080, %8081) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %8083 = "torch.aten.mul.Tensor"(%8082, %1651) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,1,1,128],bf16>) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %8084 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %8085 = "torch.aten.add.Tensor"(%8068, %8083, %8084) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %8086 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %8087 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %8088 = "torch.aten.transpose.int"(%8085, %8086, %8087) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,1,32,128],f8E4M3FNUZ>
    %8089 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %8090 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %8091 = "torch.aten.transpose.int"(%8058, %8089, %8090) : (!torch.vtensor<[4,1,8,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %8092 = "torch.aten.mul.Tensor"(%8091, %1637) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,1,1,128],bf16>) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %8093 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %8094 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %8095 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %8096 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %8097 = "torch.aten.slice.Tensor"(%8091, %8093, %8094, %8095, %8096) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,8,1,64],f8E4M3FNUZ>
    %8098 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %8099 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %8100 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %8101 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %8102 = "torch.aten.slice.Tensor"(%8091, %8098, %8099, %8100, %8101) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,8,1,64],f8E4M3FNUZ>
    %8103 = "torch.aten.neg"(%8102) : (!torch.vtensor<[4,8,1,64],f8E4M3FNUZ>) -> !torch.vtensor<[4,8,1,64],f8E4M3FNUZ>
    %8104 = "torch.prim.ListConstruct"(%8103, %8097) : (!torch.vtensor<[4,8,1,64],f8E4M3FNUZ>, !torch.vtensor<[4,8,1,64],f8E4M3FNUZ>) -> !torch.list<vtensor>
    %8105 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %8106 = "torch.aten.cat"(%8104, %8105) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %8107 = "torch.aten.mul.Tensor"(%8106, %1651) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,1,1,128],bf16>) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %8108 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %8109 = "torch.aten.add.Tensor"(%8092, %8107, %8108) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %8110 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %8111 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %8112 = "torch.aten.transpose.int"(%8109, %8110, %8111) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>
    %8113 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %8114 = "torch.aten.floor_divide.Scalar"(%arg64, %8113) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4],si64>
    %8115 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %8116 = "torch.aten.unsqueeze"(%8114, %8115) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %8117 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %8118 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %8119 = "torch.aten.gather"(%arg65, %8117, %8116, %8118) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.vtensor<[4,1],si64>, !torch.bool) -> !torch.vtensor<[4,1],si64>
    %8120 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %8121 = "torch.aten.remainder.Scalar"(%arg64, %8120) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4],si64>
    %8122 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %8123 = "torch.aten.unsqueeze"(%8121, %8122) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %8124 = "torch.constant.none"() : () -> !torch.none
    %8125 = "torch.aten.clone"(%695, %8124) : (!torch.vtensor<[],si64>, !torch.none) -> !torch.vtensor<[],si64>
    %8126 = "torch.aten.detach"(%8125) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %8127 = "torch.aten.detach"(%8126) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %8128 = "torch.aten.detach"(%8127) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %8129 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %8130 = "torch.aten.unsqueeze"(%8128, %8129) : (!torch.vtensor<[],si64>, !torch.int) -> !torch.vtensor<[1],si64>
    %8131 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %8132 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %8133 = "torch.prim.ListConstruct"(%8131, %8132) : (!torch.int, !torch.int) -> !torch.list<int>
    %8134 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %8135 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %8136 = "torch.prim.ListConstruct"(%8134, %8135) : (!torch.int, !torch.int) -> !torch.list<int>
    %8137 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %8138 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %8139 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %8140 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %8141 = "torch.aten.empty_strided"(%8133, %8136, %8137, %8138, %8139, %8140) : (!torch.list<int>, !torch.list<int>, !torch.int, !torch.int, !torch.Device, !torch.bool) -> !torch.vtensor<[4,1],si64>
    %8142 = "torch.constant.int"() <{value = 13 : i64}> : () -> !torch.int
    %8143 = "torch.aten.fill.Scalar"(%8141, %8142) : (!torch.vtensor<[4,1],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %8144 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %8145 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %8146 = "torch.prim.ListConstruct"(%8144, %8145) : (!torch.int, !torch.int) -> !torch.list<int>
    %8147 = "torch.aten.repeat"(%8130, %8146) : (!torch.vtensor<[1],si64>, !torch.list<int>) -> !torch.vtensor<[4,1],si64>
    %8148 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %8149 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %8150 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %8151 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %8152 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %8153 = "torch.prim.ListConstruct"(%1483, %8148, %8149, %8150, %8151, %8152) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8154 = "torch.aten.view"(%7724, %8153) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%8154, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %8155 = "torch.prim.ListConstruct"(%8119, %8143, %8147, %8123) : (!torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>) -> !torch.list<optional<vtensor>>
    %8156 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %8157 = "torch.aten.index_put"(%8154, %8155, %8112, %8156) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>, !torch.bool) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%8157, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %8158 = "torch.constant.int"() <{value = 2097152 : i64}> : () -> !torch.int
    %8159 = "torch.prim.ListConstruct"(%1483, %8158) : (!torch.int, !torch.int) -> !torch.list<int>
    %8160 = "torch.aten.view"(%8157, %8159) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,2097152],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%8160, %1479) <{shape_expressions = #map2}> : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> ()
    %8161 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %8162 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %8163 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %8164 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %8165 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %8166 = "torch.prim.ListConstruct"(%1483, %8161, %8162, %8163, %8164, %8165) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8167 = "torch.aten.view"(%8160, %8166) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%8167, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %8168 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %8169 = "torch.aten.floor_divide.Scalar"(%arg64, %8168) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4],si64>
    %8170 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %8171 = "torch.aten.unsqueeze"(%8169, %8170) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %8172 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %8173 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %8174 = "torch.aten.gather"(%arg65, %8172, %8171, %8173) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.vtensor<[4,1],si64>, !torch.bool) -> !torch.vtensor<[4,1],si64>
    %8175 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %8176 = "torch.aten.remainder.Scalar"(%arg64, %8175) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4],si64>
    %8177 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %8178 = "torch.aten.unsqueeze"(%8176, %8177) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %8179 = "torch.constant.none"() : () -> !torch.none
    %8180 = "torch.aten.clone"(%696, %8179) : (!torch.vtensor<[],si64>, !torch.none) -> !torch.vtensor<[],si64>
    %8181 = "torch.aten.detach"(%8180) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %8182 = "torch.aten.detach"(%8181) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %8183 = "torch.aten.detach"(%8182) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %8184 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %8185 = "torch.aten.unsqueeze"(%8183, %8184) : (!torch.vtensor<[],si64>, !torch.int) -> !torch.vtensor<[1],si64>
    %8186 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %8187 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %8188 = "torch.prim.ListConstruct"(%8186, %8187) : (!torch.int, !torch.int) -> !torch.list<int>
    %8189 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %8190 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %8191 = "torch.prim.ListConstruct"(%8189, %8190) : (!torch.int, !torch.int) -> !torch.list<int>
    %8192 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %8193 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %8194 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %8195 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %8196 = "torch.aten.empty_strided"(%8188, %8191, %8192, %8193, %8194, %8195) : (!torch.list<int>, !torch.list<int>, !torch.int, !torch.int, !torch.Device, !torch.bool) -> !torch.vtensor<[4,1],si64>
    %8197 = "torch.constant.int"() <{value = 13 : i64}> : () -> !torch.int
    %8198 = "torch.aten.fill.Scalar"(%8196, %8197) : (!torch.vtensor<[4,1],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %8199 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %8200 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %8201 = "torch.prim.ListConstruct"(%8199, %8200) : (!torch.int, !torch.int) -> !torch.list<int>
    %8202 = "torch.aten.repeat"(%8185, %8201) : (!torch.vtensor<[1],si64>, !torch.list<int>) -> !torch.vtensor<[4,1],si64>
    %8203 = "torch.prim.ListConstruct"(%8174, %8198, %8202, %8178) : (!torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>) -> !torch.list<optional<vtensor>>
    %8204 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %8205 = "torch.aten.index_put"(%8167, %8203, %8064, %8204) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>, !torch.bool) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%8205, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %8206 = "torch.constant.int"() <{value = 2097152 : i64}> : () -> !torch.int
    %8207 = "torch.prim.ListConstruct"(%1483, %8206) : (!torch.int, !torch.int) -> !torch.list<int>
    %8208 = "torch.aten.view"(%8205, %8207) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,2097152],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%8208, %1479) <{shape_expressions = #map2}> : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> ()
    %8209 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %8210 = "torch.aten.mul.Scalar"(%arg65, %8209) : (!torch.vtensor<[4,?],si64>, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%8210, %1478) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %8211 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %8212 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %8213 = "torch.aten.add.Scalar"(%8210, %8211, %8212) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%8213, %1478) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %8214 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %8215 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %8216 = "torch.aten.add.Scalar"(%8213, %8214, %8215) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%8216, %1478) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %8217 = "torch.prim.ListConstruct"(%1924) : (!torch.int) -> !torch.list<int>
    %8218 = "torch.aten.view"(%8216, %8217) : (!torch.vtensor<[4,?],si64>, !torch.list<int>) -> !torch.vtensor<[?],si64>
    "torch.bind_symbolic_shape"(%8218, %1478) <{shape_expressions = #map24}> : (!torch.vtensor<[?],si64>, !torch.int) -> ()
    %8219 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %8220 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %8221 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %8222 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %8223 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %8224 = "torch.prim.ListConstruct"(%1483, %8219, %8220, %8221, %8222, %8223) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8225 = "torch.aten.view"(%8208, %8224) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%8225, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %8226 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %8227 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %8228 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %8229 = "torch.prim.ListConstruct"(%1914, %8226, %8227, %8228) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8230 = "torch.aten.view"(%8225, %8229) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%8230, %1479) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %8231 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %8232 = "torch.aten.index_select"(%8230, %8231, %8218) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int, !torch.vtensor<[?],si64>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%8232, %1478) <{shape_expressions = #map23}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %8233 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %8234 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %8235 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %8236 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %8237 = "torch.prim.ListConstruct"(%8233, %1481, %8234, %8235, %8236) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8238 = "torch.aten.view"(%8232, %8237) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%8238, %1478) <{shape_expressions = #map22}> : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %8239 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %8240 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %8241 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %8242 = "torch.prim.ListConstruct"(%8239, %1485, %8240, %8241) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8243 = "torch.aten.view"(%8238, %8242) : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%8243, %1478) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %8244 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %8245 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %8246 = "torch.aten.add.Scalar"(%8213, %8244, %8245) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%8246, %1478) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %8247 = "torch.prim.ListConstruct"(%1924) : (!torch.int) -> !torch.list<int>
    %8248 = "torch.aten.view"(%8246, %8247) : (!torch.vtensor<[4,?],si64>, !torch.list<int>) -> !torch.vtensor<[?],si64>
    "torch.bind_symbolic_shape"(%8248, %1478) <{shape_expressions = #map24}> : (!torch.vtensor<[?],si64>, !torch.int) -> ()
    %8249 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %8250 = "torch.aten.index_select"(%8230, %8249, %8248) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int, !torch.vtensor<[?],si64>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%8250, %1478) <{shape_expressions = #map23}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %8251 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %8252 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %8253 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %8254 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %8255 = "torch.prim.ListConstruct"(%8251, %1481, %8252, %8253, %8254) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8256 = "torch.aten.view"(%8250, %8255) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%8256, %1478) <{shape_expressions = #map22}> : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %8257 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %8258 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %8259 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %8260 = "torch.prim.ListConstruct"(%8257, %1485, %8258, %8259) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8261 = "torch.aten.view"(%8256, %8260) : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%8261, %1478) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %8262 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %8263 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %8264 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %8265 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %8266 = "torch.aten.slice.Tensor"(%8243, %8262, %8263, %8264, %8265) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%8266, %1478) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %8267 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %8268 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %8269 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %8270 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %8271 = "torch.aten.slice.Tensor"(%8261, %8267, %8268, %8269, %8270) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%8271, %1478) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %8272 = "torch.constant.int"() <{value = -2 : i64}> : () -> !torch.int
    %8273 = "torch.aten.unsqueeze"(%8266, %8272) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%8273, %1478) <{shape_expressions = #map25}> : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.int) -> ()
    %8274 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %8275 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %8276 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %8277 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %8278 = "torch.prim.ListConstruct"(%8274, %1485, %8275, %8276, %8277) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8279 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %8280 = "torch.aten.expand"(%8273, %8278, %8279) : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%8280, %1478) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %8281 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %8282 = "torch.aten.clone"(%8280, %8281) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%8282, %1478) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %8283 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %8284 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %8285 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %8286 = "torch.prim.ListConstruct"(%8283, %1485, %8284, %8285) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8287 = "torch.aten._unsafe_view"(%8282, %8286) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%8287, %1478) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %8288 = "torch.constant.int"() <{value = -2 : i64}> : () -> !torch.int
    %8289 = "torch.aten.unsqueeze"(%8271, %8288) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%8289, %1478) <{shape_expressions = #map25}> : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.int) -> ()
    %8290 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %8291 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %8292 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %8293 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %8294 = "torch.prim.ListConstruct"(%8290, %1485, %8291, %8292, %8293) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8295 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %8296 = "torch.aten.expand"(%8289, %8294, %8295) : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%8296, %1478) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %8297 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %8298 = "torch.aten.clone"(%8296, %8297) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%8298, %1478) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %8299 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %8300 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %8301 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %8302 = "torch.prim.ListConstruct"(%8299, %1485, %8300, %8301) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8303 = "torch.aten._unsafe_view"(%8298, %8302) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%8303, %1478) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %8304 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %8305 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %8306 = "torch.aten.transpose.int"(%8088, %8304, %8305) : (!torch.vtensor<[4,1,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %8307 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %8308 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %8309 = "torch.aten.transpose.int"(%8287, %8307, %8308) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%8309, %1478) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %8310 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %8311 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %8312 = "torch.aten.transpose.int"(%8303, %8310, %8311) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%8312, %1478) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %8313 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %8314 = "torch.aten.squeeze.dim"(%1516, %8313) : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,1,1,?],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%8314, %1478) <{shape_expressions = #map7}> : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>, !torch.int) -> ()
    %8315 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %8316 = "torch.aten.squeeze.dim"(%8314, %8315) : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,1,1,?],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%8316, %1478) <{shape_expressions = #map7}> : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>, !torch.int) -> ()
    %8317 = "torch_c.to_builtin_tensor"(%8306) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>) -> tensor<4x32x1x128xf8E4M3FNUZ>
    %8318 = "tensor.cast"(%8317) : (tensor<4x32x1x128xf8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %8319 = "torch_c.to_builtin_tensor"(%8309) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %8320 = "torch_c.to_builtin_tensor"(%8312) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %8321 = "torch_c.to_builtin_tensor"(%8316) : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>) -> tensor<4x1x1x?xf8E4M3FNUZ>
    %8322 = "tensor.cast"(%8321) : (tensor<4x1x1x?xf8E4M3FNUZ>) -> tensor<?x?x?x?xf8E4M3FNUZ>
    %8323 = "torch_c.to_builtin_tensor"(%698) : (!torch.vtensor<[],f32>) -> tensor<f32>
    %8324 = "util.call"(%8318, %8319, %8320, %8323, %8322) <{callee = @sharktank_masked_flash_attention_4_32_128_128_f8E4M3FNUZ_f32_f32}> : (tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<f32>, tensor<?x?x?x?xf8E4M3FNUZ>) -> tensor<4x32x?x128xf32>
    %8325 = "tensor.cast"(%8324) : (tensor<4x32x?x128xf32>) -> tensor<4x32x1x128xf32>
    %8326 = "torch_c.from_builtin_tensor"(%8325) : (tensor<4x32x1x128xf32>) -> !torch.vtensor<[4,32,1,128],f32>
    %8327 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %8328 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %8329 = "torch.aten.transpose.int"(%8326, %8327, %8328) : (!torch.vtensor<[4,32,1,128],f32>, !torch.int, !torch.int) -> !torch.vtensor<[4,1,32,128],f32>
    %8330 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %8331 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %8332 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %8333 = "torch.prim.ListConstruct"(%8330, %8331, %8332) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8334 = "torch.aten.view"(%8329, %8333) : (!torch.vtensor<[4,1,32,128],f32>, !torch.list<int>) -> !torch.vtensor<[4,1,4096],f32>
    %8335 = "torch.aten.div.Tensor"(%8334, %700) : (!torch.vtensor<[4,1,4096],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],f32>
    %8336 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %8337 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %8338 = "torch.aten.clamp"(%8335, %8336, %8337) : (!torch.vtensor<[4,1,4096],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],f32>
    %8339 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %8340 = "torch.prims.convert_element_type"(%8338, %8339) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %8341 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %8342 = "torch.aten.unsqueeze"(%702, %8341) : (!torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>
    %8343 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %8344 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %8345 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %8346 = "torch.prim.ListConstruct"(%8343, %8344, %8345) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8347 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %8348 = "torch.aten.expand"(%8342, %8346, %8347) : (!torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ>
    %8349 = "torch_c.to_builtin_tensor"(%8340) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %8350 = "torch_c.to_builtin_tensor"(%8348) : (!torch.vtensor<[4,4096,4096],f8E4M3FNUZ>) -> tensor<4x4096x4096xf8E4M3FNUZ>
    %8351 = "util.call"(%8349, %8350) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x1x4096xf32>
    %8352 = "torch_c.from_builtin_tensor"(%8351) : (tensor<4x1x4096xf32>) -> !torch.vtensor<[4,1,4096],f32>
    %8353 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %8354 = "torch.prims.convert_element_type"(%8352, %8353) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %8355 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %8356 = "torch.aten.add.Tensor"(%7955, %8354, %8355) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %8357 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %8358 = "torch.prims.convert_element_type"(%8356, %8357) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f32>
    %8359 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %8360 = "torch.aten.pow.Tensor_Scalar"(%8358, %8359) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],f32>
    %8361 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %8362 = "torch.prim.ListConstruct"(%8361) : (!torch.int) -> !torch.list<int>
    %8363 = "torch.constant.bool"() <{value = true}> : () -> !torch.bool
    %8364 = "torch.constant.none"() : () -> !torch.none
    %8365 = "torch.aten.mean.dim"(%8360, %8362, %8363, %8364) : (!torch.vtensor<[4,1,4096],f32>, !torch.list<int>, !torch.bool, !torch.none) -> !torch.vtensor<[4,1,1],f32>
    %8366 = "torch.constant.float"() <{value = 1.000000e-05 : f64}> : () -> !torch.float
    %8367 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %8368 = "torch.aten.add.Scalar"(%8365, %8366, %8367) : (!torch.vtensor<[4,1,1],f32>, !torch.float, !torch.int) -> !torch.vtensor<[4,1,1],f32>
    %8369 = "torch.aten.rsqrt"(%8368) : (!torch.vtensor<[4,1,1],f32>) -> !torch.vtensor<[4,1,1],f32>
    %8370 = "torch.aten.mul.Tensor"(%8358, %8369) : (!torch.vtensor<[4,1,4096],f32>, !torch.vtensor<[4,1,1],f32>) -> !torch.vtensor<[4,1,4096],f32>
    %8371 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %8372 = "torch.prims.convert_element_type"(%8370, %8371) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %8373 = "torch.aten.mul.Tensor"(%704, %8372) : (!torch.vtensor<[4096],bf16>, !torch.vtensor<[4,1,4096],bf16>) -> !torch.vtensor<[4,1,4096],bf16>
    %8374 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %8375 = "torch.prims.convert_element_type"(%8373, %8374) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %8376 = "torch.aten.div.Tensor"(%8375, %706) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %8377 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %8378 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %8379 = "torch.aten.clamp"(%8376, %8377, %8378) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %8380 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %8381 = "torch.prims.convert_element_type"(%8379, %8380) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %8382 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %8383 = "torch.aten.unsqueeze"(%708, %8382) : (!torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>
    %8384 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %8385 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %8386 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %8387 = "torch.prim.ListConstruct"(%8384, %8385, %8386) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8388 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %8389 = "torch.aten.expand"(%8383, %8387, %8388) : (!torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ>
    %8390 = "torch_c.to_builtin_tensor"(%8381) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %8391 = "torch_c.to_builtin_tensor"(%8389) : (!torch.vtensor<[4,14336,4096],f8E4M3FNUZ>) -> tensor<4x14336x4096xf8E4M3FNUZ>
    %8392 = "util.call"(%8390, %8391) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x1x14336xf32>
    %8393 = "torch_c.from_builtin_tensor"(%8392) : (tensor<4x1x14336xf32>) -> !torch.vtensor<[4,1,14336],f32>
    %8394 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %8395 = "torch.prims.convert_element_type"(%8393, %8394) : (!torch.vtensor<[4,1,14336],f32>, !torch.int) -> !torch.vtensor<[4,1,14336],bf16>
    %8396 = "torch.aten.silu"(%8395) : (!torch.vtensor<[4,1,14336],bf16>) -> !torch.vtensor<[4,1,14336],bf16>
    %8397 = "torch.aten.div.Tensor"(%8375, %710) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %8398 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %8399 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %8400 = "torch.aten.clamp"(%8397, %8398, %8399) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %8401 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %8402 = "torch.prims.convert_element_type"(%8400, %8401) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %8403 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %8404 = "torch.aten.unsqueeze"(%712, %8403) : (!torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>
    %8405 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %8406 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %8407 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %8408 = "torch.prim.ListConstruct"(%8405, %8406, %8407) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8409 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %8410 = "torch.aten.expand"(%8404, %8408, %8409) : (!torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ>
    %8411 = "torch_c.to_builtin_tensor"(%8402) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %8412 = "torch_c.to_builtin_tensor"(%8410) : (!torch.vtensor<[4,14336,4096],f8E4M3FNUZ>) -> tensor<4x14336x4096xf8E4M3FNUZ>
    %8413 = "util.call"(%8411, %8412) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x1x14336xf32>
    %8414 = "torch_c.from_builtin_tensor"(%8413) : (tensor<4x1x14336xf32>) -> !torch.vtensor<[4,1,14336],f32>
    %8415 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %8416 = "torch.prims.convert_element_type"(%8414, %8415) : (!torch.vtensor<[4,1,14336],f32>, !torch.int) -> !torch.vtensor<[4,1,14336],bf16>
    %8417 = "torch.aten.mul.Tensor"(%8396, %8416) : (!torch.vtensor<[4,1,14336],bf16>, !torch.vtensor<[4,1,14336],bf16>) -> !torch.vtensor<[4,1,14336],bf16>
    %8418 = "torch.aten.div.Tensor"(%8417, %714) : (!torch.vtensor<[4,1,14336],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,14336],bf16>
    %8419 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %8420 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %8421 = "torch.aten.clamp"(%8418, %8419, %8420) : (!torch.vtensor<[4,1,14336],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,14336],bf16>
    %8422 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %8423 = "torch.prims.convert_element_type"(%8421, %8422) : (!torch.vtensor<[4,1,14336],bf16>, !torch.int) -> !torch.vtensor<[4,1,14336],f8E4M3FNUZ>
    %8424 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %8425 = "torch.aten.unsqueeze"(%716, %8424) : (!torch.vtensor<[4096,14336],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,14336],f8E4M3FNUZ>
    %8426 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %8427 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %8428 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %8429 = "torch.prim.ListConstruct"(%8426, %8427, %8428) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8430 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %8431 = "torch.aten.expand"(%8425, %8429, %8430) : (!torch.vtensor<[1,4096,14336],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,14336],f8E4M3FNUZ>
    %8432 = "torch_c.to_builtin_tensor"(%8423) : (!torch.vtensor<[4,1,14336],f8E4M3FNUZ>) -> tensor<4x1x14336xf8E4M3FNUZ>
    %8433 = "torch_c.to_builtin_tensor"(%8431) : (!torch.vtensor<[4,4096,14336],f8E4M3FNUZ>) -> tensor<4x4096x14336xf8E4M3FNUZ>
    %8434 = "util.call"(%8432, %8433) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x14336xf8E4M3FNUZ_R4x4096x14336xf8E4M3FNUZ}> : (tensor<4x1x14336xf8E4M3FNUZ>, tensor<4x4096x14336xf8E4M3FNUZ>) -> tensor<4x1x4096xf32>
    %8435 = "torch_c.from_builtin_tensor"(%8434) : (tensor<4x1x4096xf32>) -> !torch.vtensor<[4,1,4096],f32>
    %8436 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %8437 = "torch.prims.convert_element_type"(%8435, %8436) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %8438 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %8439 = "torch.aten.add.Tensor"(%8356, %8437, %8438) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %8440 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %8441 = "torch.prims.convert_element_type"(%8439, %8440) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f32>
    %8442 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %8443 = "torch.aten.pow.Tensor_Scalar"(%8441, %8442) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],f32>
    %8444 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %8445 = "torch.prim.ListConstruct"(%8444) : (!torch.int) -> !torch.list<int>
    %8446 = "torch.constant.bool"() <{value = true}> : () -> !torch.bool
    %8447 = "torch.constant.none"() : () -> !torch.none
    %8448 = "torch.aten.mean.dim"(%8443, %8445, %8446, %8447) : (!torch.vtensor<[4,1,4096],f32>, !torch.list<int>, !torch.bool, !torch.none) -> !torch.vtensor<[4,1,1],f32>
    %8449 = "torch.constant.float"() <{value = 1.000000e-05 : f64}> : () -> !torch.float
    %8450 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %8451 = "torch.aten.add.Scalar"(%8448, %8449, %8450) : (!torch.vtensor<[4,1,1],f32>, !torch.float, !torch.int) -> !torch.vtensor<[4,1,1],f32>
    %8452 = "torch.aten.rsqrt"(%8451) : (!torch.vtensor<[4,1,1],f32>) -> !torch.vtensor<[4,1,1],f32>
    %8453 = "torch.aten.mul.Tensor"(%8441, %8452) : (!torch.vtensor<[4,1,4096],f32>, !torch.vtensor<[4,1,1],f32>) -> !torch.vtensor<[4,1,4096],f32>
    %8454 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %8455 = "torch.prims.convert_element_type"(%8453, %8454) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %8456 = "torch.aten.mul.Tensor"(%718, %8455) : (!torch.vtensor<[4096],bf16>, !torch.vtensor<[4,1,4096],bf16>) -> !torch.vtensor<[4,1,4096],bf16>
    %8457 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %8458 = "torch.prims.convert_element_type"(%8456, %8457) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %8459 = "torch.aten.div.Tensor"(%8458, %720) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %8460 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %8461 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %8462 = "torch.aten.clamp"(%8459, %8460, %8461) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %8463 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %8464 = "torch.prims.convert_element_type"(%8462, %8463) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %8465 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %8466 = "torch.aten.unsqueeze"(%722, %8465) : (!torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>
    %8467 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %8468 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %8469 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %8470 = "torch.prim.ListConstruct"(%8467, %8468, %8469) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8471 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %8472 = "torch.aten.expand"(%8466, %8470, %8471) : (!torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ>
    %8473 = "torch_c.to_builtin_tensor"(%8464) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %8474 = "torch_c.to_builtin_tensor"(%8472) : (!torch.vtensor<[4,4096,4096],f8E4M3FNUZ>) -> tensor<4x4096x4096xf8E4M3FNUZ>
    %8475 = "util.call"(%8473, %8474) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x1x4096xf32>
    %8476 = "torch_c.from_builtin_tensor"(%8475) : (tensor<4x1x4096xf32>) -> !torch.vtensor<[4,1,4096],f32>
    %8477 = "torch.aten.div.Tensor"(%8476, %724) : (!torch.vtensor<[4,1,4096],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],f32>
    %8478 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %8479 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %8480 = "torch.aten.clamp"(%8477, %8478, %8479) : (!torch.vtensor<[4,1,4096],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],f32>
    %8481 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %8482 = "torch.prims.convert_element_type"(%8480, %8481) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %8483 = "torch.aten.div.Tensor"(%8458, %726) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %8484 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %8485 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %8486 = "torch.aten.clamp"(%8483, %8484, %8485) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %8487 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %8488 = "torch.prims.convert_element_type"(%8486, %8487) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %8489 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %8490 = "torch.aten.unsqueeze"(%728, %8489) : (!torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>
    %8491 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %8492 = "torch.constant.int"() <{value = 1024 : i64}> : () -> !torch.int
    %8493 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %8494 = "torch.prim.ListConstruct"(%8491, %8492, %8493) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8495 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %8496 = "torch.aten.expand"(%8490, %8494, %8495) : (!torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ>
    %8497 = "torch_c.to_builtin_tensor"(%8488) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %8498 = "torch_c.to_builtin_tensor"(%8496) : (!torch.vtensor<[4,1024,4096],f8E4M3FNUZ>) -> tensor<4x1024x4096xf8E4M3FNUZ>
    %8499 = "util.call"(%8497, %8498) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x1x1024xf32>
    %8500 = "torch_c.from_builtin_tensor"(%8499) : (tensor<4x1x1024xf32>) -> !torch.vtensor<[4,1,1024],f32>
    %8501 = "torch.aten.div.Tensor"(%8500, %730) : (!torch.vtensor<[4,1,1024],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,1024],f32>
    %8502 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %8503 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %8504 = "torch.aten.clamp"(%8501, %8502, %8503) : (!torch.vtensor<[4,1,1024],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,1024],f32>
    %8505 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %8506 = "torch.prims.convert_element_type"(%8504, %8505) : (!torch.vtensor<[4,1,1024],f32>, !torch.int) -> !torch.vtensor<[4,1,1024],f8E4M3FNUZ>
    %8507 = "torch.aten.div.Tensor"(%8458, %732) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %8508 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %8509 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %8510 = "torch.aten.clamp"(%8507, %8508, %8509) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %8511 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %8512 = "torch.prims.convert_element_type"(%8510, %8511) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %8513 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %8514 = "torch.aten.unsqueeze"(%734, %8513) : (!torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>
    %8515 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %8516 = "torch.constant.int"() <{value = 1024 : i64}> : () -> !torch.int
    %8517 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %8518 = "torch.prim.ListConstruct"(%8515, %8516, %8517) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8519 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %8520 = "torch.aten.expand"(%8514, %8518, %8519) : (!torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ>
    %8521 = "torch_c.to_builtin_tensor"(%8512) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %8522 = "torch_c.to_builtin_tensor"(%8520) : (!torch.vtensor<[4,1024,4096],f8E4M3FNUZ>) -> tensor<4x1024x4096xf8E4M3FNUZ>
    %8523 = "util.call"(%8521, %8522) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x1x1024xf32>
    %8524 = "torch_c.from_builtin_tensor"(%8523) : (tensor<4x1x1024xf32>) -> !torch.vtensor<[4,1,1024],f32>
    %8525 = "torch.aten.div.Tensor"(%8524, %736) : (!torch.vtensor<[4,1,1024],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,1024],f32>
    %8526 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %8527 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %8528 = "torch.aten.clamp"(%8525, %8526, %8527) : (!torch.vtensor<[4,1,1024],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,1024],f32>
    %8529 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %8530 = "torch.prims.convert_element_type"(%8528, %8529) : (!torch.vtensor<[4,1,1024],f32>, !torch.int) -> !torch.vtensor<[4,1,1024],f8E4M3FNUZ>
    %8531 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %8532 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %8533 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %8534 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %8535 = "torch.prim.ListConstruct"(%8531, %8532, %8533, %8534) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8536 = "torch.aten.view"(%8482, %8535) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,1,32,128],f8E4M3FNUZ>
    %8537 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %8538 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %8539 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %8540 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %8541 = "torch.prim.ListConstruct"(%8537, %8538, %8539, %8540) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8542 = "torch.aten.view"(%8506, %8541) : (!torch.vtensor<[4,1,1024],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>
    %8543 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %8544 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %8545 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %8546 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %8547 = "torch.prim.ListConstruct"(%8543, %8544, %8545, %8546) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8548 = "torch.aten.view"(%8530, %8547) : (!torch.vtensor<[4,1,1024],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>
    %8549 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %8550 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %8551 = "torch.aten.transpose.int"(%8536, %8549, %8550) : (!torch.vtensor<[4,1,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %8552 = "torch.aten.mul.Tensor"(%8551, %1637) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,1,1,128],bf16>) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %8553 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %8554 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %8555 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %8556 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %8557 = "torch.aten.slice.Tensor"(%8551, %8553, %8554, %8555, %8556) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,32,1,64],f8E4M3FNUZ>
    %8558 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %8559 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %8560 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %8561 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %8562 = "torch.aten.slice.Tensor"(%8551, %8558, %8559, %8560, %8561) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,32,1,64],f8E4M3FNUZ>
    %8563 = "torch.aten.neg"(%8562) : (!torch.vtensor<[4,32,1,64],f8E4M3FNUZ>) -> !torch.vtensor<[4,32,1,64],f8E4M3FNUZ>
    %8564 = "torch.prim.ListConstruct"(%8563, %8557) : (!torch.vtensor<[4,32,1,64],f8E4M3FNUZ>, !torch.vtensor<[4,32,1,64],f8E4M3FNUZ>) -> !torch.list<vtensor>
    %8565 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %8566 = "torch.aten.cat"(%8564, %8565) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %8567 = "torch.aten.mul.Tensor"(%8566, %1651) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,1,1,128],bf16>) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %8568 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %8569 = "torch.aten.add.Tensor"(%8552, %8567, %8568) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %8570 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %8571 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %8572 = "torch.aten.transpose.int"(%8569, %8570, %8571) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,1,32,128],f8E4M3FNUZ>
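    // The same rotation applied to the 8 K heads.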
    %8573 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %8574 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %8575 = "torch.aten.transpose.int"(%8542, %8573, %8574) : (!torch.vtensor<[4,1,8,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %8576 = "torch.aten.mul.Tensor"(%8575, %1637) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,1,1,128],bf16>) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %8577 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %8578 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %8579 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %8580 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %8581 = "torch.aten.slice.Tensor"(%8575, %8577, %8578, %8579, %8580) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,8,1,64],f8E4M3FNUZ>
    %8582 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %8583 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %8584 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %8585 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %8586 = "torch.aten.slice.Tensor"(%8575, %8582, %8583, %8584, %8585) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,8,1,64],f8E4M3FNUZ>
    %8587 = "torch.aten.neg"(%8586) : (!torch.vtensor<[4,8,1,64],f8E4M3FNUZ>) -> !torch.vtensor<[4,8,1,64],f8E4M3FNUZ>
    %8588 = "torch.prim.ListConstruct"(%8587, %8581) : (!torch.vtensor<[4,8,1,64],f8E4M3FNUZ>, !torch.vtensor<[4,8,1,64],f8E4M3FNUZ>) -> !torch.list<vtensor>
    %8589 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %8590 = "torch.aten.cat"(%8588, %8589) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %8591 = "torch.aten.mul.Tensor"(%8590, %1651) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,1,1,128],bf16>) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %8592 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %8593 = "torch.aten.add.Tensor"(%8576, %8591, %8592) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %8594 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %8595 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %8596 = "torch.aten.transpose.int"(%8593, %8594, %8595) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>
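    // Paged KV-cache write indices: page = pos // 32, mapped through the page table
    // %arg65, and slot = pos % 32. The [4,1] tensor filled with 14 is evidently this
    // block's row in the 32-row cache layout; the broadcast scalar %737 selects the
    // K/V partition.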
    %8597 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %8598 = "torch.aten.floor_divide.Scalar"(%arg64, %8597) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4],si64>
    %8599 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %8600 = "torch.aten.unsqueeze"(%8598, %8599) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %8601 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %8602 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %8603 = "torch.aten.gather"(%arg65, %8601, %8600, %8602) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.vtensor<[4,1],si64>, !torch.bool) -> !torch.vtensor<[4,1],si64>
    %8604 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %8605 = "torch.aten.remainder.Scalar"(%arg64, %8604) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4],si64>
    %8606 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %8607 = "torch.aten.unsqueeze"(%8605, %8606) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %8608 = "torch.constant.none"() : () -> !torch.none
    %8609 = "torch.aten.clone"(%737, %8608) : (!torch.vtensor<[],si64>, !torch.none) -> !torch.vtensor<[],si64>
    %8610 = "torch.aten.detach"(%8609) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %8611 = "torch.aten.detach"(%8610) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %8612 = "torch.aten.detach"(%8611) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %8613 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %8614 = "torch.aten.unsqueeze"(%8612, %8613) : (!torch.vtensor<[],si64>, !torch.int) -> !torch.vtensor<[1],si64>
    %8615 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %8616 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %8617 = "torch.prim.ListConstruct"(%8615, %8616) : (!torch.int, !torch.int) -> !torch.list<int>
    %8618 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %8619 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %8620 = "torch.prim.ListConstruct"(%8618, %8619) : (!torch.int, !torch.int) -> !torch.list<int>
    %8621 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %8622 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %8623 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %8624 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %8625 = "torch.aten.empty_strided"(%8617, %8620, %8621, %8622, %8623, %8624) : (!torch.list<int>, !torch.list<int>, !torch.int, !torch.int, !torch.Device, !torch.bool) -> !torch.vtensor<[4,1],si64>
    %8626 = "torch.constant.int"() <{value = 14 : i64}> : () -> !torch.int
    %8627 = "torch.aten.fill.Scalar"(%8625, %8626) : (!torch.vtensor<[4,1],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %8628 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %8629 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %8630 = "torch.prim.ListConstruct"(%8628, %8629) : (!torch.int, !torch.int) -> !torch.list<int>
    %8631 = "torch.aten.repeat"(%8614, %8630) : (!torch.vtensor<[1],si64>, !torch.list<int>) -> !torch.vtensor<[4,1],si64>
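    // Each 2097152-element cache page factors exactly as 32 blocks x 2 (K/V) x
    // 32 slots x 8 heads x 128 lanes. index_put scatters the rotated K heads at
    // (page, block, partition, slot), and the cache is flattened back afterwards.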
    %8632 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %8633 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %8634 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %8635 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %8636 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %8637 = "torch.prim.ListConstruct"(%1483, %8632, %8633, %8634, %8635, %8636) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8638 = "torch.aten.view"(%8208, %8637) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%8638, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %8639 = "torch.prim.ListConstruct"(%8603, %8627, %8631, %8607) : (!torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>) -> !torch.list<optional<vtensor>>
    %8640 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %8641 = "torch.aten.index_put"(%8638, %8639, %8596, %8640) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>, !torch.bool) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%8641, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %8642 = "torch.constant.int"() <{value = 2097152 : i64}> : () -> !torch.int
    %8643 = "torch.prim.ListConstruct"(%1483, %8642) : (!torch.int, !torch.int) -> !torch.list<int>
    %8644 = "torch.aten.view"(%8641, %8643) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,2097152],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%8644, %1479) <{shape_expressions = #map2}> : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> ()
    %8645 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %8646 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %8647 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %8648 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %8649 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %8650 = "torch.prim.ListConstruct"(%1483, %8645, %8646, %8647, %8648, %8649) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8651 = "torch.aten.view"(%8644, %8650) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%8651, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %8652 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %8653 = "torch.aten.floor_divide.Scalar"(%arg64, %8652) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4],si64>
    %8654 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %8655 = "torch.aten.unsqueeze"(%8653, %8654) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %8656 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %8657 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %8658 = "torch.aten.gather"(%arg65, %8656, %8655, %8657) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.vtensor<[4,1],si64>, !torch.bool) -> !torch.vtensor<[4,1],si64>
    %8659 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %8660 = "torch.aten.remainder.Scalar"(%arg64, %8659) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4],si64>
    %8661 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %8662 = "torch.aten.unsqueeze"(%8660, %8661) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %8663 = "torch.constant.none"() : () -> !torch.none
    %8664 = "torch.aten.clone"(%738, %8663) : (!torch.vtensor<[],si64>, !torch.none) -> !torch.vtensor<[],si64>
    %8665 = "torch.aten.detach"(%8664) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %8666 = "torch.aten.detach"(%8665) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %8667 = "torch.aten.detach"(%8666) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %8668 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %8669 = "torch.aten.unsqueeze"(%8667, %8668) : (!torch.vtensor<[],si64>, !torch.int) -> !torch.vtensor<[1],si64>
    %8670 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %8671 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %8672 = "torch.prim.ListConstruct"(%8670, %8671) : (!torch.int, !torch.int) -> !torch.list<int>
    %8673 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %8674 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %8675 = "torch.prim.ListConstruct"(%8673, %8674) : (!torch.int, !torch.int) -> !torch.list<int>
    %8676 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %8677 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %8678 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %8679 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %8680 = "torch.aten.empty_strided"(%8672, %8675, %8676, %8677, %8678, %8679) : (!torch.list<int>, !torch.list<int>, !torch.int, !torch.int, !torch.Device, !torch.bool) -> !torch.vtensor<[4,1],si64>
    %8681 = "torch.constant.int"() <{value = 14 : i64}> : () -> !torch.int
    %8682 = "torch.aten.fill.Scalar"(%8680, %8681) : (!torch.vtensor<[4,1],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %8683 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %8684 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %8685 = "torch.prim.ListConstruct"(%8683, %8684) : (!torch.int, !torch.int) -> !torch.list<int>
    %8686 = "torch.aten.repeat"(%8669, %8685) : (!torch.vtensor<[1],si64>, !torch.list<int>) -> !torch.vtensor<[4,1],si64>
    %8687 = "torch.prim.ListConstruct"(%8658, %8682, %8686, %8662) : (!torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>) -> !torch.list<optional<vtensor>>
    %8688 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %8689 = "torch.aten.index_put"(%8651, %8687, %8548, %8688) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>, !torch.bool) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%8689, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %8690 = "torch.constant.int"() <{value = 2097152 : i64}> : () -> !torch.int
    %8691 = "torch.prim.ListConstruct"(%1483, %8690) : (!torch.int, !torch.int) -> !torch.list<int>
    %8692 = "torch.aten.view"(%8689, %8691) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,2097152],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%8692, %1479) <{shape_expressions = #map2}> : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> ()
    %8693 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %8694 = "torch.aten.mul.Scalar"(%arg65, %8693) : (!torch.vtensor<[4,?],si64>, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%8694, %1478) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %8695 = "torch.constant.int"() <{value = 28 : i64}> : () -> !torch.int
    %8696 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %8697 = "torch.aten.add.Scalar"(%8694, %8695, %8696) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%8697, %1478) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %8698 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %8699 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %8700 = "torch.aten.add.Scalar"(%8697, %8698, %8699) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%8700, %1478) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %8701 = "torch.prim.ListConstruct"(%1924) : (!torch.int) -> !torch.list<int>
    %8702 = "torch.aten.view"(%8700, %8701) : (!torch.vtensor<[4,?],si64>, !torch.list<int>) -> !torch.vtensor<[?],si64>
    "torch.bind_symbolic_shape"(%8702, %1478) <{shape_expressions = #map24}> : (!torch.vtensor<[?],si64>, !torch.int) -> ()
    %8703 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %8704 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %8705 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %8706 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %8707 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %8708 = "torch.prim.ListConstruct"(%1483, %8703, %8704, %8705, %8706, %8707) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8709 = "torch.aten.view"(%8692, %8708) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%8709, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %8710 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %8711 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %8712 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %8713 = "torch.prim.ListConstruct"(%1914, %8710, %8711, %8712) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8714 = "torch.aten.view"(%8709, %8713) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%8714, %1479) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %8715 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %8716 = "torch.aten.index_select"(%8714, %8715, %8702) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int, !torch.vtensor<[?],si64>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%8716, %1478) <{shape_expressions = #map23}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %8717 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %8718 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %8719 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %8720 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %8721 = "torch.prim.ListConstruct"(%8717, %1481, %8718, %8719, %8720) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8722 = "torch.aten.view"(%8716, %8721) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%8722, %1478) <{shape_expressions = #map22}> : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %8723 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %8724 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %8725 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %8726 = "torch.prim.ListConstruct"(%8723, %1485, %8724, %8725) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8727 = "torch.aten.view"(%8722, %8726) : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%8727, %1478) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %8728 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %8729 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %8730 = "torch.aten.add.Scalar"(%8697, %8728, %8729) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%8730, %1478) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %8731 = "torch.prim.ListConstruct"(%1924) : (!torch.int) -> !torch.list<int>
    %8732 = "torch.aten.view"(%8730, %8731) : (!torch.vtensor<[4,?],si64>, !torch.list<int>) -> !torch.vtensor<[?],si64>
    "torch.bind_symbolic_shape"(%8732, %1478) <{shape_expressions = #map24}> : (!torch.vtensor<[?],si64>, !torch.int) -> ()
    %8733 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %8734 = "torch.aten.index_select"(%8714, %8733, %8732) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int, !torch.vtensor<[?],si64>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%8734, %1478) <{shape_expressions = #map23}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %8735 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %8736 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %8737 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %8738 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %8739 = "torch.prim.ListConstruct"(%8735, %1481, %8736, %8737, %8738) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8740 = "torch.aten.view"(%8734, %8739) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%8740, %1478) <{shape_expressions = #map22}> : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %8741 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %8742 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %8743 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %8744 = "torch.prim.ListConstruct"(%8741, %1485, %8742, %8743) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8745 = "torch.aten.view"(%8740, %8744) : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%8745, %1478) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %8746 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %8747 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %8748 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %8749 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %8750 = "torch.aten.slice.Tensor"(%8727, %8746, %8747, %8748, %8749) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%8750, %1478) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %8751 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %8752 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %8753 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %8754 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %8755 = "torch.aten.slice.Tensor"(%8745, %8751, %8752, %8753, %8754) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%8755, %1478) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %8756 = "torch.constant.int"() <{value = -2 : i64}> : () -> !torch.int
    %8757 = "torch.aten.unsqueeze"(%8750, %8756) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%8757, %1478) <{shape_expressions = #map25}> : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.int) -> ()
    %8758 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %8759 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %8760 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %8761 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %8762 = "torch.prim.ListConstruct"(%8758, %1485, %8759, %8760, %8761) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8763 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %8764 = "torch.aten.expand"(%8757, %8762, %8763) : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%8764, %1478) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %8765 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %8766 = "torch.aten.clone"(%8764, %8765) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%8766, %1478) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %8767 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %8768 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %8769 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %8770 = "torch.prim.ListConstruct"(%8767, %1485, %8768, %8769) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8771 = "torch.aten._unsafe_view"(%8766, %8770) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%8771, %1478) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %8772 = "torch.constant.int"() <{value = -2 : i64}> : () -> !torch.int
    %8773 = "torch.aten.unsqueeze"(%8755, %8772) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%8773, %1478) <{shape_expressions = #map25}> : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.int) -> ()
    %8774 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %8775 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %8776 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %8777 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %8778 = "torch.prim.ListConstruct"(%8774, %1485, %8775, %8776, %8777) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8779 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %8780 = "torch.aten.expand"(%8773, %8778, %8779) : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%8780, %1478) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %8781 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %8782 = "torch.aten.clone"(%8780, %8781) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%8782, %1478) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %8783 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %8784 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %8785 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %8786 = "torch.prim.ListConstruct"(%8783, %1485, %8784, %8785) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8787 = "torch.aten._unsafe_view"(%8782, %8786) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%8787, %1478) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %8788 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %8789 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %8790 = "torch.aten.transpose.int"(%8572, %8788, %8789) : (!torch.vtensor<[4,1,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %8791 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %8792 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %8793 = "torch.aten.transpose.int"(%8771, %8791, %8792) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%8793, %1478) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %8794 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %8795 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %8796 = "torch.aten.transpose.int"(%8787, %8794, %8795) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%8796, %1478) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %8797 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %8798 = "torch.aten.squeeze.dim"(%1516, %8797) : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,1,1,?],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%8798, %1478) <{shape_expressions = #map7}> : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>, !torch.int) -> ()
    %8799 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %8800 = "torch.aten.squeeze.dim"(%8798, %8799) : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,1,1,?],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%8800, %1478) <{shape_expressions = #map7}> : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>, !torch.int) -> ()
    %8801 = "torch_c.to_builtin_tensor"(%8790) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>) -> tensor<4x32x1x128xf8E4M3FNUZ>
    %8802 = "tensor.cast"(%8801) : (tensor<4x32x1x128xf8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %8803 = "torch_c.to_builtin_tensor"(%8793) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %8804 = "torch_c.to_builtin_tensor"(%8796) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %8805 = "torch_c.to_builtin_tensor"(%8800) : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>) -> tensor<4x1x1x?xf8E4M3FNUZ>
    %8806 = "tensor.cast"(%8805) : (tensor<4x1x1x?xf8E4M3FNUZ>) -> tensor<?x?x?x?xf8E4M3FNUZ>
    %8807 = "torch_c.to_builtin_tensor"(%740) : (!torch.vtensor<[],f32>) -> tensor<f32>
    %8808 = "util.call"(%8802, %8803, %8804, %8807, %8806) <{callee = @sharktank_masked_flash_attention_4_32_128_128_f8E4M3FNUZ_f32_f32}> : (tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<f32>, tensor<?x?x?x?xf8E4M3FNUZ>) -> tensor<4x32x?x128xf32>
    %8809 = "tensor.cast"(%8808) : (tensor<4x32x?x128xf32>) -> tensor<4x32x1x128xf32>
    %8810 = "torch_c.from_builtin_tensor"(%8809) : (tensor<4x32x1x128xf32>) -> !torch.vtensor<[4,32,1,128],f32>
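    // Back to [4,1,4096], requantize to f8, and apply the [4096,4096] attention output
    // projection; the f32 result is narrowed to bf16 (dtype 15).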
    %8811 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %8812 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %8813 = "torch.aten.transpose.int"(%8810, %8811, %8812) : (!torch.vtensor<[4,32,1,128],f32>, !torch.int, !torch.int) -> !torch.vtensor<[4,1,32,128],f32>
    %8814 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %8815 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %8816 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %8817 = "torch.prim.ListConstruct"(%8814, %8815, %8816) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8818 = "torch.aten.view"(%8813, %8817) : (!torch.vtensor<[4,1,32,128],f32>, !torch.list<int>) -> !torch.vtensor<[4,1,4096],f32>
    %8819 = "torch.aten.div.Tensor"(%8818, %742) : (!torch.vtensor<[4,1,4096],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],f32>
    %8820 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %8821 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %8822 = "torch.aten.clamp"(%8819, %8820, %8821) : (!torch.vtensor<[4,1,4096],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],f32>
    %8823 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %8824 = "torch.prims.convert_element_type"(%8822, %8823) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %8825 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %8826 = "torch.aten.unsqueeze"(%744, %8825) : (!torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>
    %8827 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %8828 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %8829 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %8830 = "torch.prim.ListConstruct"(%8827, %8828, %8829) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8831 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %8832 = "torch.aten.expand"(%8826, %8830, %8831) : (!torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ>
    %8833 = "torch_c.to_builtin_tensor"(%8824) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %8834 = "torch_c.to_builtin_tensor"(%8832) : (!torch.vtensor<[4,4096,4096],f8E4M3FNUZ>) -> tensor<4x4096x4096xf8E4M3FNUZ>
    %8835 = "util.call"(%8833, %8834) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x1x4096xf32>
    %8836 = "torch_c.from_builtin_tensor"(%8835) : (tensor<4x1x4096xf32>) -> !torch.vtensor<[4,1,4096],f32>
    %8837 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %8838 = "torch.prims.convert_element_type"(%8836, %8837) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
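    // First residual: add the attention branch back onto the block input (%8439).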
    %8839 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %8840 = "torch.aten.add.Tensor"(%8439, %8838, %8839) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
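    // RMSNorm over the summed stream: x * rsqrt(mean(x^2, last dim) + 1e-5) in f32,
    // then scaled by the ffn_norm weight %746 in bf16.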
    %8841 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %8842 = "torch.prims.convert_element_type"(%8840, %8841) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f32>
    %8843 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %8844 = "torch.aten.pow.Tensor_Scalar"(%8842, %8843) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],f32>
    %8845 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %8846 = "torch.prim.ListConstruct"(%8845) : (!torch.int) -> !torch.list<int>
    %8847 = "torch.constant.bool"() <{value = true}> : () -> !torch.bool
    %8848 = "torch.constant.none"() : () -> !torch.none
    %8849 = "torch.aten.mean.dim"(%8844, %8846, %8847, %8848) : (!torch.vtensor<[4,1,4096],f32>, !torch.list<int>, !torch.bool, !torch.none) -> !torch.vtensor<[4,1,1],f32>
    %8850 = "torch.constant.float"() <{value = 1.000000e-05 : f64}> : () -> !torch.float
    %8851 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %8852 = "torch.aten.add.Scalar"(%8849, %8850, %8851) : (!torch.vtensor<[4,1,1],f32>, !torch.float, !torch.int) -> !torch.vtensor<[4,1,1],f32>
    %8853 = "torch.aten.rsqrt"(%8852) : (!torch.vtensor<[4,1,1],f32>) -> !torch.vtensor<[4,1,1],f32>
    %8854 = "torch.aten.mul.Tensor"(%8842, %8853) : (!torch.vtensor<[4,1,4096],f32>, !torch.vtensor<[4,1,1],f32>) -> !torch.vtensor<[4,1,4096],f32>
    %8855 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %8856 = "torch.prims.convert_element_type"(%8854, %8855) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %8857 = "torch.aten.mul.Tensor"(%746, %8856) : (!torch.vtensor<[4096],bf16>, !torch.vtensor<[4,1,4096],bf16>) -> !torch.vtensor<[4,1,4096],bf16>
    %8858 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %8859 = "torch.prims.convert_element_type"(%8857, %8858) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
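    // SwiGLU feed-forward, gate branch: quantize to f8, batch-matmul against the
    // [14336,4096] gate weight, widen to bf16, apply SiLU.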
    %8860 = "torch.aten.div.Tensor"(%8859, %748) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %8861 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %8862 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %8863 = "torch.aten.clamp"(%8860, %8861, %8862) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %8864 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %8865 = "torch.prims.convert_element_type"(%8863, %8864) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %8866 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %8867 = "torch.aten.unsqueeze"(%750, %8866) : (!torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>
    %8868 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %8869 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %8870 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %8871 = "torch.prim.ListConstruct"(%8868, %8869, %8870) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8872 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %8873 = "torch.aten.expand"(%8867, %8871, %8872) : (!torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ>
    %8874 = "torch_c.to_builtin_tensor"(%8865) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %8875 = "torch_c.to_builtin_tensor"(%8873) : (!torch.vtensor<[4,14336,4096],f8E4M3FNUZ>) -> tensor<4x14336x4096xf8E4M3FNUZ>
    %8876 = "util.call"(%8874, %8875) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x1x14336xf32>
    %8877 = "torch_c.from_builtin_tensor"(%8876) : (tensor<4x1x14336xf32>) -> !torch.vtensor<[4,1,14336],f32>
    %8878 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %8879 = "torch.prims.convert_element_type"(%8877, %8878) : (!torch.vtensor<[4,1,14336],f32>, !torch.int) -> !torch.vtensor<[4,1,14336],bf16>
    %8880 = "torch.aten.silu"(%8879) : (!torch.vtensor<[4,1,14336],bf16>) -> !torch.vtensor<[4,1,14336],bf16>
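    // Up branch from the same normed activations (its own scale %752), followed by the
    // elementwise product silu(gate) * up.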
    %8881 = "torch.aten.div.Tensor"(%8859, %752) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %8882 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %8883 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %8884 = "torch.aten.clamp"(%8881, %8882, %8883) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %8885 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %8886 = "torch.prims.convert_element_type"(%8884, %8885) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %8887 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %8888 = "torch.aten.unsqueeze"(%754, %8887) : (!torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>
    %8889 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %8890 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %8891 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %8892 = "torch.prim.ListConstruct"(%8889, %8890, %8891) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8893 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %8894 = "torch.aten.expand"(%8888, %8892, %8893) : (!torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ>
    %8895 = "torch_c.to_builtin_tensor"(%8886) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %8896 = "torch_c.to_builtin_tensor"(%8894) : (!torch.vtensor<[4,14336,4096],f8E4M3FNUZ>) -> tensor<4x14336x4096xf8E4M3FNUZ>
    %8897 = "util.call"(%8895, %8896) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x1x14336xf32>
    %8898 = "torch_c.from_builtin_tensor"(%8897) : (tensor<4x1x14336xf32>) -> !torch.vtensor<[4,1,14336],f32>
    %8899 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %8900 = "torch.prims.convert_element_type"(%8898, %8899) : (!torch.vtensor<[4,1,14336],f32>, !torch.int) -> !torch.vtensor<[4,1,14336],bf16>
    %8901 = "torch.aten.mul.Tensor"(%8880, %8900) : (!torch.vtensor<[4,1,14336],bf16>, !torch.vtensor<[4,1,14336],bf16>) -> !torch.vtensor<[4,1,14336],bf16>
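    // Quantize the 14336-wide product and project back down to 4096; the second
    // residual add (%8840 + FFN output) closes the block.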
    %8902 = "torch.aten.div.Tensor"(%8901, %756) : (!torch.vtensor<[4,1,14336],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,14336],bf16>
    %8903 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %8904 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %8905 = "torch.aten.clamp"(%8902, %8903, %8904) : (!torch.vtensor<[4,1,14336],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,14336],bf16>
    %8906 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %8907 = "torch.prims.convert_element_type"(%8905, %8906) : (!torch.vtensor<[4,1,14336],bf16>, !torch.int) -> !torch.vtensor<[4,1,14336],f8E4M3FNUZ>
    %8908 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %8909 = "torch.aten.unsqueeze"(%758, %8908) : (!torch.vtensor<[4096,14336],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,14336],f8E4M3FNUZ>
    %8910 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %8911 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %8912 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %8913 = "torch.prim.ListConstruct"(%8910, %8911, %8912) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8914 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %8915 = "torch.aten.expand"(%8909, %8913, %8914) : (!torch.vtensor<[1,4096,14336],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,14336],f8E4M3FNUZ>
    %8916 = "torch_c.to_builtin_tensor"(%8907) : (!torch.vtensor<[4,1,14336],f8E4M3FNUZ>) -> tensor<4x1x14336xf8E4M3FNUZ>
    %8917 = "torch_c.to_builtin_tensor"(%8915) : (!torch.vtensor<[4,4096,14336],f8E4M3FNUZ>) -> tensor<4x4096x14336xf8E4M3FNUZ>
    %8918 = "util.call"(%8916, %8917) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x14336xf8E4M3FNUZ_R4x4096x14336xf8E4M3FNUZ}> : (tensor<4x1x14336xf8E4M3FNUZ>, tensor<4x4096x14336xf8E4M3FNUZ>) -> tensor<4x1x4096xf32>
    %8919 = "torch_c.from_builtin_tensor"(%8918) : (tensor<4x1x4096xf32>) -> !torch.vtensor<[4,1,4096],f32>
    %8920 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %8921 = "torch.prims.convert_element_type"(%8919, %8920) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %8922 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %8923 = "torch.aten.add.Tensor"(%8840, %8921, %8922) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
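    // Next decoder block: attn_norm RMSNorm over the updated residual stream,
    // with weight %760.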
    %8924 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %8925 = "torch.prims.convert_element_type"(%8923, %8924) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f32>
    %8926 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %8927 = "torch.aten.pow.Tensor_Scalar"(%8925, %8926) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],f32>
    %8928 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %8929 = "torch.prim.ListConstruct"(%8928) : (!torch.int) -> !torch.list<int>
    %8930 = "torch.constant.bool"() <{value = true}> : () -> !torch.bool
    %8931 = "torch.constant.none"() : () -> !torch.none
    %8932 = "torch.aten.mean.dim"(%8927, %8929, %8930, %8931) : (!torch.vtensor<[4,1,4096],f32>, !torch.list<int>, !torch.bool, !torch.none) -> !torch.vtensor<[4,1,1],f32>
    %8933 = "torch.constant.float"() <{value = 1.000000e-05 : f64}> : () -> !torch.float
    %8934 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %8935 = "torch.aten.add.Scalar"(%8932, %8933, %8934) : (!torch.vtensor<[4,1,1],f32>, !torch.float, !torch.int) -> !torch.vtensor<[4,1,1],f32>
    %8936 = "torch.aten.rsqrt"(%8935) : (!torch.vtensor<[4,1,1],f32>) -> !torch.vtensor<[4,1,1],f32>
    %8937 = "torch.aten.mul.Tensor"(%8925, %8936) : (!torch.vtensor<[4,1,4096],f32>, !torch.vtensor<[4,1,1],f32>) -> !torch.vtensor<[4,1,4096],f32>
    %8938 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %8939 = "torch.prims.convert_element_type"(%8937, %8938) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %8940 = "torch.aten.mul.Tensor"(%760, %8939) : (!torch.vtensor<[4096],bf16>, !torch.vtensor<[4,1,4096],bf16>) -> !torch.vtensor<[4,1,4096],bf16>
    %8941 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %8942 = "torch.prims.convert_element_type"(%8940, %8941) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
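    // Q/K/V projections for this block: Q through the [4096,4096] weight %764, K and V
    // through the [1024,4096] weights %770 and %776, each branch with its own input
    // scale and f8 requantization.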
    %8943 = "torch.aten.div.Tensor"(%8942, %762) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %8944 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %8945 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %8946 = "torch.aten.clamp"(%8943, %8944, %8945) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %8947 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %8948 = "torch.prims.convert_element_type"(%8946, %8947) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %8949 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %8950 = "torch.aten.unsqueeze"(%764, %8949) : (!torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>
    %8951 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %8952 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %8953 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %8954 = "torch.prim.ListConstruct"(%8951, %8952, %8953) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8955 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %8956 = "torch.aten.expand"(%8950, %8954, %8955) : (!torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ>
    %8957 = "torch_c.to_builtin_tensor"(%8948) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %8958 = "torch_c.to_builtin_tensor"(%8956) : (!torch.vtensor<[4,4096,4096],f8E4M3FNUZ>) -> tensor<4x4096x4096xf8E4M3FNUZ>
    %8959 = "util.call"(%8957, %8958) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x1x4096xf32>
    %8960 = "torch_c.from_builtin_tensor"(%8959) : (tensor<4x1x4096xf32>) -> !torch.vtensor<[4,1,4096],f32>
    %8961 = "torch.aten.div.Tensor"(%8960, %766) : (!torch.vtensor<[4,1,4096],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],f32>
    %8962 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %8963 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %8964 = "torch.aten.clamp"(%8961, %8962, %8963) : (!torch.vtensor<[4,1,4096],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],f32>
    %8965 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %8966 = "torch.prims.convert_element_type"(%8964, %8965) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %8967 = "torch.aten.div.Tensor"(%8942, %768) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %8968 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %8969 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %8970 = "torch.aten.clamp"(%8967, %8968, %8969) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %8971 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %8972 = "torch.prims.convert_element_type"(%8970, %8971) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %8973 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %8974 = "torch.aten.unsqueeze"(%770, %8973) : (!torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>
    %8975 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %8976 = "torch.constant.int"() <{value = 1024 : i64}> : () -> !torch.int
    %8977 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %8978 = "torch.prim.ListConstruct"(%8975, %8976, %8977) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8979 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %8980 = "torch.aten.expand"(%8974, %8978, %8979) : (!torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ>
    %8981 = "torch_c.to_builtin_tensor"(%8972) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %8982 = "torch_c.to_builtin_tensor"(%8980) : (!torch.vtensor<[4,1024,4096],f8E4M3FNUZ>) -> tensor<4x1024x4096xf8E4M3FNUZ>
    %8983 = "util.call"(%8981, %8982) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x1x1024xf32>
    %8984 = "torch_c.from_builtin_tensor"(%8983) : (tensor<4x1x1024xf32>) -> !torch.vtensor<[4,1,1024],f32>
    %8985 = "torch.aten.div.Tensor"(%8984, %772) : (!torch.vtensor<[4,1,1024],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,1024],f32>
    %8986 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %8987 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %8988 = "torch.aten.clamp"(%8985, %8986, %8987) : (!torch.vtensor<[4,1,1024],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,1024],f32>
    %8989 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %8990 = "torch.prims.convert_element_type"(%8988, %8989) : (!torch.vtensor<[4,1,1024],f32>, !torch.int) -> !torch.vtensor<[4,1,1024],f8E4M3FNUZ>
    %8991 = "torch.aten.div.Tensor"(%8942, %774) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %8992 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %8993 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %8994 = "torch.aten.clamp"(%8991, %8992, %8993) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %8995 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %8996 = "torch.prims.convert_element_type"(%8994, %8995) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %8997 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %8998 = "torch.aten.unsqueeze"(%776, %8997) : (!torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>
    %8999 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %9000 = "torch.constant.int"() <{value = 1024 : i64}> : () -> !torch.int
    %9001 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %9002 = "torch.prim.ListConstruct"(%8999, %9000, %9001) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9003 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %9004 = "torch.aten.expand"(%8998, %9002, %9003) : (!torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ>
    %9005 = "torch_c.to_builtin_tensor"(%8996) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %9006 = "torch_c.to_builtin_tensor"(%9004) : (!torch.vtensor<[4,1024,4096],f8E4M3FNUZ>) -> tensor<4x1024x4096xf8E4M3FNUZ>
    %9007 = "util.call"(%9005, %9006) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x1x1024xf32>
    %9008 = "torch_c.from_builtin_tensor"(%9007) : (tensor<4x1x1024xf32>) -> !torch.vtensor<[4,1,1024],f32>
    %9009 = "torch.aten.div.Tensor"(%9008, %778) : (!torch.vtensor<[4,1,1024],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,1024],f32>
    %9010 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %9011 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %9012 = "torch.aten.clamp"(%9009, %9010, %9011) : (!torch.vtensor<[4,1,1024],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,1024],f32>
    %9013 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %9014 = "torch.prims.convert_element_type"(%9012, %9013) : (!torch.vtensor<[4,1,1024],f32>, !torch.int) -> !torch.vtensor<[4,1,1024],f8E4M3FNUZ>
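    // As before: split into 32 Q heads and 8 K/V heads of 128 lanes each.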
    %9015 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %9016 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %9017 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %9018 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %9019 = "torch.prim.ListConstruct"(%9015, %9016, %9017, %9018) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9020 = "torch.aten.view"(%8966, %9019) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,1,32,128],f8E4M3FNUZ>
    %9021 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %9022 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %9023 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %9024 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %9025 = "torch.prim.ListConstruct"(%9021, %9022, %9023, %9024) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9026 = "torch.aten.view"(%8990, %9025) : (!torch.vtensor<[4,1,1024],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>
    %9027 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %9028 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %9029 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %9030 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %9031 = "torch.prim.ListConstruct"(%9027, %9028, %9029, %9030) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9032 = "torch.aten.view"(%9014, %9031) : (!torch.vtensor<[4,1,1024],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>
    %9033 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %9034 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %9035 = "torch.aten.transpose.int"(%9020, %9033, %9034) : (!torch.vtensor<[4,1,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %9036 = "torch.aten.mul.Tensor"(%9035, %1637) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,1,1,128],bf16>) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %9037 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %9038 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %9039 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %9040 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %9041 = "torch.aten.slice.Tensor"(%9035, %9037, %9038, %9039, %9040) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,32,1,64],f8E4M3FNUZ>
    %9042 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %9043 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %9044 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %9045 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %9046 = "torch.aten.slice.Tensor"(%9035, %9042, %9043, %9044, %9045) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,32,1,64],f8E4M3FNUZ>
    %9047 = "torch.aten.neg"(%9046) : (!torch.vtensor<[4,32,1,64],f8E4M3FNUZ>) -> !torch.vtensor<[4,32,1,64],f8E4M3FNUZ>
    %9048 = "torch.prim.ListConstruct"(%9047, %9041) : (!torch.vtensor<[4,32,1,64],f8E4M3FNUZ>, !torch.vtensor<[4,32,1,64],f8E4M3FNUZ>) -> !torch.list<vtensor>
    %9049 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %9050 = "torch.aten.cat"(%9048, %9049) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %9051 = "torch.aten.mul.Tensor"(%9050, %1651) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,1,1,128],bf16>) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %9052 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %9053 = "torch.aten.add.Tensor"(%9036, %9051, %9052) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %9054 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %9055 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %9056 = "torch.aten.transpose.int"(%9053, %9054, %9055) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,1,32,128],f8E4M3FNUZ>
    %9057 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %9058 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %9059 = "torch.aten.transpose.int"(%9026, %9057, %9058) : (!torch.vtensor<[4,1,8,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %9060 = "torch.aten.mul.Tensor"(%9059, %1637) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,1,1,128],bf16>) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %9061 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %9062 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %9063 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %9064 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %9065 = "torch.aten.slice.Tensor"(%9059, %9061, %9062, %9063, %9064) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,8,1,64],f8E4M3FNUZ>
    %9066 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %9067 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %9068 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %9069 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %9070 = "torch.aten.slice.Tensor"(%9059, %9066, %9067, %9068, %9069) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,8,1,64],f8E4M3FNUZ>
    %9071 = "torch.aten.neg"(%9070) : (!torch.vtensor<[4,8,1,64],f8E4M3FNUZ>) -> !torch.vtensor<[4,8,1,64],f8E4M3FNUZ>
    %9072 = "torch.prim.ListConstruct"(%9071, %9065) : (!torch.vtensor<[4,8,1,64],f8E4M3FNUZ>, !torch.vtensor<[4,8,1,64],f8E4M3FNUZ>) -> !torch.list<vtensor>
    %9073 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %9074 = "torch.aten.cat"(%9072, %9073) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %9075 = "torch.aten.mul.Tensor"(%9074, %1651) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,1,1,128],bf16>) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %9076 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %9077 = "torch.aten.add.Tensor"(%9060, %9075, %9076) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %9078 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %9079 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %9080 = "torch.aten.transpose.int"(%9077, %9078, %9079) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>
    %9081 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %9082 = "torch.aten.floor_divide.Scalar"(%arg64, %9081) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4],si64>
    %9083 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %9084 = "torch.aten.unsqueeze"(%9082, %9083) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %9085 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %9086 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %9087 = "torch.aten.gather"(%arg65, %9085, %9084, %9086) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.vtensor<[4,1],si64>, !torch.bool) -> !torch.vtensor<[4,1],si64>
    %9088 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %9089 = "torch.aten.remainder.Scalar"(%arg64, %9088) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4],si64>
    %9090 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %9091 = "torch.aten.unsqueeze"(%9089, %9090) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %9092 = "torch.constant.none"() : () -> !torch.none
    %9093 = "torch.aten.clone"(%779, %9092) : (!torch.vtensor<[],si64>, !torch.none) -> !torch.vtensor<[],si64>
    %9094 = "torch.aten.detach"(%9093) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %9095 = "torch.aten.detach"(%9094) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %9096 = "torch.aten.detach"(%9095) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %9097 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %9098 = "torch.aten.unsqueeze"(%9096, %9097) : (!torch.vtensor<[],si64>, !torch.int) -> !torch.vtensor<[1],si64>
    %9099 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %9100 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %9101 = "torch.prim.ListConstruct"(%9099, %9100) : (!torch.int, !torch.int) -> !torch.list<int>
    %9102 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %9103 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %9104 = "torch.prim.ListConstruct"(%9102, %9103) : (!torch.int, !torch.int) -> !torch.list<int>
    %9105 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %9106 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %9107 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %9108 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %9109 = "torch.aten.empty_strided"(%9101, %9104, %9105, %9106, %9107, %9108) : (!torch.list<int>, !torch.list<int>, !torch.int, !torch.int, !torch.Device, !torch.bool) -> !torch.vtensor<[4,1],si64>
    %9110 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %9111 = "torch.aten.fill.Scalar"(%9109, %9110) : (!torch.vtensor<[4,1],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %9112 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %9113 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %9114 = "torch.prim.ListConstruct"(%9112, %9113) : (!torch.int, !torch.int) -> !torch.list<int>
    %9115 = "torch.aten.repeat"(%9098, %9114) : (!torch.vtensor<[1],si64>, !torch.list<int>) -> !torch.vtensor<[4,1],si64>
    %9116 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %9117 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %9118 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %9119 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %9120 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %9121 = "torch.prim.ListConstruct"(%1483, %9116, %9117, %9118, %9119, %9120) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9122 = "torch.aten.view"(%8692, %9121) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%9122, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %9123 = "torch.prim.ListConstruct"(%9087, %9111, %9115, %9091) : (!torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>) -> !torch.list<optional<vtensor>>
    %9124 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %9125 = "torch.aten.index_put"(%9122, %9123, %9080, %9124) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>, !torch.bool) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%9125, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %9126 = "torch.constant.int"() <{value = 2097152 : i64}> : () -> !torch.int
    %9127 = "torch.prim.ListConstruct"(%1483, %9126) : (!torch.int, !torch.int) -> !torch.list<int>
    %9128 = "torch.aten.view"(%9125, %9127) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,2097152],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%9128, %1479) <{shape_expressions = #map2}> : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> ()
    %9129 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %9130 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %9131 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %9132 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %9133 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %9134 = "torch.prim.ListConstruct"(%1483, %9129, %9130, %9131, %9132, %9133) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9135 = "torch.aten.view"(%9128, %9134) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%9135, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %9136 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %9137 = "torch.aten.floor_divide.Scalar"(%arg64, %9136) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4],si64>
    %9138 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %9139 = "torch.aten.unsqueeze"(%9137, %9138) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %9140 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %9141 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %9142 = "torch.aten.gather"(%arg65, %9140, %9139, %9141) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.vtensor<[4,1],si64>, !torch.bool) -> !torch.vtensor<[4,1],si64>
    %9143 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %9144 = "torch.aten.remainder.Scalar"(%arg64, %9143) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4],si64>
    %9145 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %9146 = "torch.aten.unsqueeze"(%9144, %9145) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %9147 = "torch.constant.none"() : () -> !torch.none
    %9148 = "torch.aten.clone"(%780, %9147) : (!torch.vtensor<[],si64>, !torch.none) -> !torch.vtensor<[],si64>
    %9149 = "torch.aten.detach"(%9148) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %9150 = "torch.aten.detach"(%9149) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %9151 = "torch.aten.detach"(%9150) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %9152 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %9153 = "torch.aten.unsqueeze"(%9151, %9152) : (!torch.vtensor<[],si64>, !torch.int) -> !torch.vtensor<[1],si64>
    %9154 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %9155 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %9156 = "torch.prim.ListConstruct"(%9154, %9155) : (!torch.int, !torch.int) -> !torch.list<int>
    %9157 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %9158 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %9159 = "torch.prim.ListConstruct"(%9157, %9158) : (!torch.int, !torch.int) -> !torch.list<int>
    %9160 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %9161 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %9162 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %9163 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %9164 = "torch.aten.empty_strided"(%9156, %9159, %9160, %9161, %9162, %9163) : (!torch.list<int>, !torch.list<int>, !torch.int, !torch.int, !torch.Device, !torch.bool) -> !torch.vtensor<[4,1],si64>
    %9165 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %9166 = "torch.aten.fill.Scalar"(%9164, %9165) : (!torch.vtensor<[4,1],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %9167 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %9168 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %9169 = "torch.prim.ListConstruct"(%9167, %9168) : (!torch.int, !torch.int) -> !torch.list<int>
    %9170 = "torch.aten.repeat"(%9153, %9169) : (!torch.vtensor<[1],si64>, !torch.list<int>) -> !torch.vtensor<[4,1],si64>
    %9171 = "torch.prim.ListConstruct"(%9142, %9166, %9170, %9146) : (!torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>) -> !torch.list<optional<vtensor>>
    %9172 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %9173 = "torch.aten.index_put"(%9135, %9171, %9032, %9172) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>, !torch.bool) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%9173, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %9174 = "torch.constant.int"() <{value = 2097152 : i64}> : () -> !torch.int
    %9175 = "torch.prim.ListConstruct"(%1483, %9174) : (!torch.int, !torch.int) -> !torch.list<int>
    %9176 = "torch.aten.view"(%9173, %9175) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,2097152],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%9176, %1479) <{shape_expressions = #map2}> : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> ()
    %9177 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %9178 = "torch.aten.mul.Scalar"(%arg65, %9177) : (!torch.vtensor<[4,?],si64>, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%9178, %1478) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %9179 = "torch.constant.int"() <{value = 30 : i64}> : () -> !torch.int
    %9180 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %9181 = "torch.aten.add.Scalar"(%9178, %9179, %9180) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%9181, %1478) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %9182 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %9183 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %9184 = "torch.aten.add.Scalar"(%9181, %9182, %9183) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%9184, %1478) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %9185 = "torch.prim.ListConstruct"(%1924) : (!torch.int) -> !torch.list<int>
    %9186 = "torch.aten.view"(%9184, %9185) : (!torch.vtensor<[4,?],si64>, !torch.list<int>) -> !torch.vtensor<[?],si64>
    "torch.bind_symbolic_shape"(%9186, %1478) <{shape_expressions = #map24}> : (!torch.vtensor<[?],si64>, !torch.int) -> ()
    %9187 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %9188 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %9189 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %9190 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %9191 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %9192 = "torch.prim.ListConstruct"(%1483, %9187, %9188, %9189, %9190, %9191) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9193 = "torch.aten.view"(%9176, %9192) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%9193, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %9194 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %9195 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %9196 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %9197 = "torch.prim.ListConstruct"(%1914, %9194, %9195, %9196) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9198 = "torch.aten.view"(%9193, %9197) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%9198, %1479) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %9199 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %9200 = "torch.aten.index_select"(%9198, %9199, %9186) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int, !torch.vtensor<[?],si64>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%9200, %1478) <{shape_expressions = #map23}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %9201 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %9202 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %9203 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %9204 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %9205 = "torch.prim.ListConstruct"(%9201, %1481, %9202, %9203, %9204) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9206 = "torch.aten.view"(%9200, %9205) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%9206, %1478) <{shape_expressions = #map22}> : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %9207 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %9208 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %9209 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %9210 = "torch.prim.ListConstruct"(%9207, %1485, %9208, %9209) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9211 = "torch.aten.view"(%9206, %9210) : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%9211, %1478) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %9212 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %9213 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %9214 = "torch.aten.add.Scalar"(%9181, %9212, %9213) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%9214, %1478) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %9215 = "torch.prim.ListConstruct"(%1924) : (!torch.int) -> !torch.list<int>
    %9216 = "torch.aten.view"(%9214, %9215) : (!torch.vtensor<[4,?],si64>, !torch.list<int>) -> !torch.vtensor<[?],si64>
    "torch.bind_symbolic_shape"(%9216, %1478) <{shape_expressions = #map24}> : (!torch.vtensor<[?],si64>, !torch.int) -> ()
    %9217 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %9218 = "torch.aten.index_select"(%9198, %9217, %9216) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int, !torch.vtensor<[?],si64>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%9218, %1478) <{shape_expressions = #map23}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %9219 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %9220 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %9221 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %9222 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %9223 = "torch.prim.ListConstruct"(%9219, %1481, %9220, %9221, %9222) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9224 = "torch.aten.view"(%9218, %9223) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%9224, %1478) <{shape_expressions = #map22}> : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %9225 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %9226 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %9227 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %9228 = "torch.prim.ListConstruct"(%9225, %1485, %9226, %9227) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9229 = "torch.aten.view"(%9224, %9228) : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%9229, %1478) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %9230 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %9231 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %9232 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %9233 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %9234 = "torch.aten.slice.Tensor"(%9211, %9230, %9231, %9232, %9233) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%9234, %1478) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %9235 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %9236 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %9237 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %9238 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %9239 = "torch.aten.slice.Tensor"(%9229, %9235, %9236, %9237, %9238) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%9239, %1478) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %9240 = "torch.constant.int"() <{value = -2 : i64}> : () -> !torch.int
    %9241 = "torch.aten.unsqueeze"(%9234, %9240) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%9241, %1478) <{shape_expressions = #map25}> : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.int) -> ()
    %9242 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %9243 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %9244 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %9245 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %9246 = "torch.prim.ListConstruct"(%9242, %1485, %9243, %9244, %9245) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9247 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %9248 = "torch.aten.expand"(%9241, %9246, %9247) : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%9248, %1478) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %9249 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %9250 = "torch.aten.clone"(%9248, %9249) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%9250, %1478) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %9251 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %9252 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %9253 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %9254 = "torch.prim.ListConstruct"(%9251, %1485, %9252, %9253) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9255 = "torch.aten._unsafe_view"(%9250, %9254) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%9255, %1478) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %9256 = "torch.constant.int"() <{value = -2 : i64}> : () -> !torch.int
    %9257 = "torch.aten.unsqueeze"(%9239, %9256) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%9257, %1478) <{shape_expressions = #map25}> : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.int) -> ()
    %9258 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %9259 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %9260 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %9261 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %9262 = "torch.prim.ListConstruct"(%9258, %1485, %9259, %9260, %9261) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9263 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %9264 = "torch.aten.expand"(%9257, %9262, %9263) : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%9264, %1478) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %9265 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %9266 = "torch.aten.clone"(%9264, %9265) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%9266, %1478) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %9267 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %9268 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %9269 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %9270 = "torch.prim.ListConstruct"(%9267, %1485, %9268, %9269) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9271 = "torch.aten._unsafe_view"(%9266, %9270) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%9271, %1478) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %9272 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %9273 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %9274 = "torch.aten.transpose.int"(%9056, %9272, %9273) : (!torch.vtensor<[4,1,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %9275 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %9276 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %9277 = "torch.aten.transpose.int"(%9255, %9275, %9276) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%9277, %1478) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %9278 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %9279 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %9280 = "torch.aten.transpose.int"(%9271, %9278, %9279) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%9280, %1478) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %9281 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %9282 = "torch.aten.squeeze.dim"(%1516, %9281) : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,1,1,?],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%9282, %1478) <{shape_expressions = #map7}> : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>, !torch.int) -> ()
    %9283 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %9284 = "torch.aten.squeeze.dim"(%9282, %9283) : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,1,1,?],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%9284, %1478) <{shape_expressions = #map7}> : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>, !torch.int) -> ()
    %9285 = "torch_c.to_builtin_tensor"(%9274) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>) -> tensor<4x32x1x128xf8E4M3FNUZ>
    %9286 = "tensor.cast"(%9285) : (tensor<4x32x1x128xf8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %9287 = "torch_c.to_builtin_tensor"(%9277) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %9288 = "torch_c.to_builtin_tensor"(%9280) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %9289 = "torch_c.to_builtin_tensor"(%9284) : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>) -> tensor<4x1x1x?xf8E4M3FNUZ>
    %9290 = "tensor.cast"(%9289) : (tensor<4x1x1x?xf8E4M3FNUZ>) -> tensor<?x?x?x?xf8E4M3FNUZ>
    %9291 = "torch_c.to_builtin_tensor"(%782) : (!torch.vtensor<[],f32>) -> tensor<f32>
    %9292 = "util.call"(%9286, %9287, %9288, %9291, %9290) <{callee = @sharktank_masked_flash_attention_4_32_128_128_f8E4M3FNUZ_f32_f32}> : (tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<f32>, tensor<?x?x?x?xf8E4M3FNUZ>) -> tensor<4x32x?x128xf32>
    %9293 = "tensor.cast"(%9292) : (tensor<4x32x?x128xf32>) -> tensor<4x32x1x128xf32>
    %9294 = "torch_c.from_builtin_tensor"(%9293) : (tensor<4x32x1x128xf32>) -> !torch.vtensor<[4,32,1,128],f32>
    %9295 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %9296 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %9297 = "torch.aten.transpose.int"(%9294, %9295, %9296) : (!torch.vtensor<[4,32,1,128],f32>, !torch.int, !torch.int) -> !torch.vtensor<[4,1,32,128],f32>
    %9298 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %9299 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %9300 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %9301 = "torch.prim.ListConstruct"(%9298, %9299, %9300) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9302 = "torch.aten.view"(%9297, %9301) : (!torch.vtensor<[4,1,32,128],f32>, !torch.list<int>) -> !torch.vtensor<[4,1,4096],f32>
    %9303 = "torch.aten.div.Tensor"(%9302, %784) : (!torch.vtensor<[4,1,4096],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],f32>
    %9304 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %9305 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %9306 = "torch.aten.clamp"(%9303, %9304, %9305) : (!torch.vtensor<[4,1,4096],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],f32>
    %9307 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %9308 = "torch.prims.convert_element_type"(%9306, %9307) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %9309 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %9310 = "torch.aten.unsqueeze"(%786, %9309) : (!torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>
    %9311 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %9312 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %9313 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %9314 = "torch.prim.ListConstruct"(%9311, %9312, %9313) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9315 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %9316 = "torch.aten.expand"(%9310, %9314, %9315) : (!torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ>
    %9317 = "torch_c.to_builtin_tensor"(%9308) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %9318 = "torch_c.to_builtin_tensor"(%9316) : (!torch.vtensor<[4,4096,4096],f8E4M3FNUZ>) -> tensor<4x4096x4096xf8E4M3FNUZ>
    %9319 = "util.call"(%9317, %9318) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x1x4096xf32>
    %9320 = "torch_c.from_builtin_tensor"(%9319) : (tensor<4x1x4096xf32>) -> !torch.vtensor<[4,1,4096],f32>
    %9321 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %9322 = "torch.prims.convert_element_type"(%9320, %9321) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %9323 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %9324 = "torch.aten.add.Tensor"(%8923, %9322, %9323) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %9325 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %9326 = "torch.prims.convert_element_type"(%9324, %9325) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f32>
    %9327 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %9328 = "torch.aten.pow.Tensor_Scalar"(%9326, %9327) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],f32>
    %9329 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %9330 = "torch.prim.ListConstruct"(%9329) : (!torch.int) -> !torch.list<int>
    %9331 = "torch.constant.bool"() <{value = true}> : () -> !torch.bool
    %9332 = "torch.constant.none"() : () -> !torch.none
    %9333 = "torch.aten.mean.dim"(%9328, %9330, %9331, %9332) : (!torch.vtensor<[4,1,4096],f32>, !torch.list<int>, !torch.bool, !torch.none) -> !torch.vtensor<[4,1,1],f32>
    %9334 = "torch.constant.float"() <{value = 1.000000e-05 : f64}> : () -> !torch.float
    %9335 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %9336 = "torch.aten.add.Scalar"(%9333, %9334, %9335) : (!torch.vtensor<[4,1,1],f32>, !torch.float, !torch.int) -> !torch.vtensor<[4,1,1],f32>
    %9337 = "torch.aten.rsqrt"(%9336) : (!torch.vtensor<[4,1,1],f32>) -> !torch.vtensor<[4,1,1],f32>
    %9338 = "torch.aten.mul.Tensor"(%9326, %9337) : (!torch.vtensor<[4,1,4096],f32>, !torch.vtensor<[4,1,1],f32>) -> !torch.vtensor<[4,1,4096],f32>
    %9339 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %9340 = "torch.prims.convert_element_type"(%9338, %9339) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %9341 = "torch.aten.mul.Tensor"(%788, %9340) : (!torch.vtensor<[4096],bf16>, !torch.vtensor<[4,1,4096],bf16>) -> !torch.vtensor<[4,1,4096],bf16>
    %9342 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %9343 = "torch.prims.convert_element_type"(%9341, %9342) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %9344 = "torch.aten.div.Tensor"(%9343, %790) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %9345 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %9346 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %9347 = "torch.aten.clamp"(%9344, %9345, %9346) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %9348 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %9349 = "torch.prims.convert_element_type"(%9347, %9348) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %9350 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %9351 = "torch.aten.unsqueeze"(%792, %9350) : (!torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>
    %9352 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %9353 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %9354 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %9355 = "torch.prim.ListConstruct"(%9352, %9353, %9354) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9356 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %9357 = "torch.aten.expand"(%9351, %9355, %9356) : (!torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ>
    %9358 = "torch_c.to_builtin_tensor"(%9349) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %9359 = "torch_c.to_builtin_tensor"(%9357) : (!torch.vtensor<[4,14336,4096],f8E4M3FNUZ>) -> tensor<4x14336x4096xf8E4M3FNUZ>
    %9360 = "util.call"(%9358, %9359) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x1x14336xf32>
    %9361 = "torch_c.from_builtin_tensor"(%9360) : (tensor<4x1x14336xf32>) -> !torch.vtensor<[4,1,14336],f32>
    %9362 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %9363 = "torch.prims.convert_element_type"(%9361, %9362) : (!torch.vtensor<[4,1,14336],f32>, !torch.int) -> !torch.vtensor<[4,1,14336],bf16>
    %9364 = "torch.aten.silu"(%9363) : (!torch.vtensor<[4,1,14336],bf16>) -> !torch.vtensor<[4,1,14336],bf16>
    %9365 = "torch.aten.div.Tensor"(%9343, %794) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %9366 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %9367 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %9368 = "torch.aten.clamp"(%9365, %9366, %9367) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %9369 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %9370 = "torch.prims.convert_element_type"(%9368, %9369) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %9371 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %9372 = "torch.aten.unsqueeze"(%796, %9371) : (!torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>
    %9373 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %9374 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %9375 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %9376 = "torch.prim.ListConstruct"(%9373, %9374, %9375) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9377 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %9378 = "torch.aten.expand"(%9372, %9376, %9377) : (!torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ>
    %9379 = "torch_c.to_builtin_tensor"(%9370) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %9380 = "torch_c.to_builtin_tensor"(%9378) : (!torch.vtensor<[4,14336,4096],f8E4M3FNUZ>) -> tensor<4x14336x4096xf8E4M3FNUZ>
    %9381 = "util.call"(%9379, %9380) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x1x14336xf32>
    %9382 = "torch_c.from_builtin_tensor"(%9381) : (tensor<4x1x14336xf32>) -> !torch.vtensor<[4,1,14336],f32>
    %9383 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %9384 = "torch.prims.convert_element_type"(%9382, %9383) : (!torch.vtensor<[4,1,14336],f32>, !torch.int) -> !torch.vtensor<[4,1,14336],bf16>
    %9385 = "torch.aten.mul.Tensor"(%9364, %9384) : (!torch.vtensor<[4,1,14336],bf16>, !torch.vtensor<[4,1,14336],bf16>) -> !torch.vtensor<[4,1,14336],bf16>
    %9386 = "torch.aten.div.Tensor"(%9385, %798) : (!torch.vtensor<[4,1,14336],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,14336],bf16>
    %9387 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %9388 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %9389 = "torch.aten.clamp"(%9386, %9387, %9388) : (!torch.vtensor<[4,1,14336],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,14336],bf16>
    %9390 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %9391 = "torch.prims.convert_element_type"(%9389, %9390) : (!torch.vtensor<[4,1,14336],bf16>, !torch.int) -> !torch.vtensor<[4,1,14336],f8E4M3FNUZ>
    %9392 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %9393 = "torch.aten.unsqueeze"(%800, %9392) : (!torch.vtensor<[4096,14336],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,14336],f8E4M3FNUZ>
    %9394 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %9395 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %9396 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %9397 = "torch.prim.ListConstruct"(%9394, %9395, %9396) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9398 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %9399 = "torch.aten.expand"(%9393, %9397, %9398) : (!torch.vtensor<[1,4096,14336],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,14336],f8E4M3FNUZ>
    %9400 = "torch_c.to_builtin_tensor"(%9391) : (!torch.vtensor<[4,1,14336],f8E4M3FNUZ>) -> tensor<4x1x14336xf8E4M3FNUZ>
    %9401 = "torch_c.to_builtin_tensor"(%9399) : (!torch.vtensor<[4,4096,14336],f8E4M3FNUZ>) -> tensor<4x4096x14336xf8E4M3FNUZ>
    %9402 = "util.call"(%9400, %9401) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x14336xf8E4M3FNUZ_R4x4096x14336xf8E4M3FNUZ}> : (tensor<4x1x14336xf8E4M3FNUZ>, tensor<4x4096x14336xf8E4M3FNUZ>) -> tensor<4x1x4096xf32>
    %9403 = "torch_c.from_builtin_tensor"(%9402) : (tensor<4x1x4096xf32>) -> !torch.vtensor<[4,1,4096],f32>
    %9404 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %9405 = "torch.prims.convert_element_type"(%9403, %9404) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %9406 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %9407 = "torch.aten.add.Tensor"(%9324, %9405, %9406) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %9408 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %9409 = "torch.prims.convert_element_type"(%9407, %9408) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f32>
    %9410 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %9411 = "torch.aten.pow.Tensor_Scalar"(%9409, %9410) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],f32>
    %9412 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %9413 = "torch.prim.ListConstruct"(%9412) : (!torch.int) -> !torch.list<int>
    %9414 = "torch.constant.bool"() <{value = true}> : () -> !torch.bool
    %9415 = "torch.constant.none"() : () -> !torch.none
    %9416 = "torch.aten.mean.dim"(%9411, %9413, %9414, %9415) : (!torch.vtensor<[4,1,4096],f32>, !torch.list<int>, !torch.bool, !torch.none) -> !torch.vtensor<[4,1,1],f32>
    %9417 = "torch.constant.float"() <{value = 1.000000e-05 : f64}> : () -> !torch.float
    %9418 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %9419 = "torch.aten.add.Scalar"(%9416, %9417, %9418) : (!torch.vtensor<[4,1,1],f32>, !torch.float, !torch.int) -> !torch.vtensor<[4,1,1],f32>
    %9420 = "torch.aten.rsqrt"(%9419) : (!torch.vtensor<[4,1,1],f32>) -> !torch.vtensor<[4,1,1],f32>
    %9421 = "torch.aten.mul.Tensor"(%9409, %9420) : (!torch.vtensor<[4,1,4096],f32>, !torch.vtensor<[4,1,1],f32>) -> !torch.vtensor<[4,1,4096],f32>
    %9422 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %9423 = "torch.prims.convert_element_type"(%9421, %9422) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %9424 = "torch.aten.mul.Tensor"(%802, %9423) : (!torch.vtensor<[4096],bf16>, !torch.vtensor<[4,1,4096],bf16>) -> !torch.vtensor<[4,1,4096],bf16>
    %9425 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %9426 = "torch.prims.convert_element_type"(%9424, %9425) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %9427 = "torch.aten.div.Tensor"(%9426, %804) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %9428 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %9429 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %9430 = "torch.aten.clamp"(%9427, %9428, %9429) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %9431 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %9432 = "torch.prims.convert_element_type"(%9430, %9431) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %9433 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %9434 = "torch.aten.unsqueeze"(%806, %9433) : (!torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>
    %9435 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %9436 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %9437 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %9438 = "torch.prim.ListConstruct"(%9435, %9436, %9437) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9439 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %9440 = "torch.aten.expand"(%9434, %9438, %9439) : (!torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ>
    %9441 = "torch_c.to_builtin_tensor"(%9432) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %9442 = "torch_c.to_builtin_tensor"(%9440) : (!torch.vtensor<[4,4096,4096],f8E4M3FNUZ>) -> tensor<4x4096x4096xf8E4M3FNUZ>
    %9443 = "util.call"(%9441, %9442) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x1x4096xf32>
    %9444 = "torch_c.from_builtin_tensor"(%9443) : (tensor<4x1x4096xf32>) -> !torch.vtensor<[4,1,4096],f32>
    %9445 = "torch.aten.div.Tensor"(%9444, %808) : (!torch.vtensor<[4,1,4096],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],f32>
    %9446 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %9447 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %9448 = "torch.aten.clamp"(%9445, %9446, %9447) : (!torch.vtensor<[4,1,4096],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],f32>
    %9449 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %9450 = "torch.prims.convert_element_type"(%9448, %9449) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %9451 = "torch.aten.div.Tensor"(%9426, %810) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %9452 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %9453 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %9454 = "torch.aten.clamp"(%9451, %9452, %9453) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %9455 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %9456 = "torch.prims.convert_element_type"(%9454, %9455) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %9457 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %9458 = "torch.aten.unsqueeze"(%812, %9457) : (!torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>
    %9459 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %9460 = "torch.constant.int"() <{value = 1024 : i64}> : () -> !torch.int
    %9461 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %9462 = "torch.prim.ListConstruct"(%9459, %9460, %9461) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9463 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %9464 = "torch.aten.expand"(%9458, %9462, %9463) : (!torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ>
    %9465 = "torch_c.to_builtin_tensor"(%9456) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %9466 = "torch_c.to_builtin_tensor"(%9464) : (!torch.vtensor<[4,1024,4096],f8E4M3FNUZ>) -> tensor<4x1024x4096xf8E4M3FNUZ>
    %9467 = "util.call"(%9465, %9466) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x1x1024xf32>
    %9468 = "torch_c.from_builtin_tensor"(%9467) : (tensor<4x1x1024xf32>) -> !torch.vtensor<[4,1,1024],f32>
    %9469 = "torch.aten.div.Tensor"(%9468, %814) : (!torch.vtensor<[4,1,1024],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,1024],f32>
    %9470 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %9471 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %9472 = "torch.aten.clamp"(%9469, %9470, %9471) : (!torch.vtensor<[4,1,1024],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,1024],f32>
    %9473 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %9474 = "torch.prims.convert_element_type"(%9472, %9473) : (!torch.vtensor<[4,1,1024],f32>, !torch.int) -> !torch.vtensor<[4,1,1024],f8E4M3FNUZ>
    %9475 = "torch.aten.div.Tensor"(%9426, %816) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %9476 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %9477 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %9478 = "torch.aten.clamp"(%9475, %9476, %9477) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %9479 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %9480 = "torch.prims.convert_element_type"(%9478, %9479) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %9481 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %9482 = "torch.aten.unsqueeze"(%818, %9481) : (!torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>
    %9483 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %9484 = "torch.constant.int"() <{value = 1024 : i64}> : () -> !torch.int
    %9485 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %9486 = "torch.prim.ListConstruct"(%9483, %9484, %9485) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9487 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %9488 = "torch.aten.expand"(%9482, %9486, %9487) : (!torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ>
    %9489 = "torch_c.to_builtin_tensor"(%9480) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %9490 = "torch_c.to_builtin_tensor"(%9488) : (!torch.vtensor<[4,1024,4096],f8E4M3FNUZ>) -> tensor<4x1024x4096xf8E4M3FNUZ>
    %9491 = "util.call"(%9489, %9490) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x1x1024xf32>
    %9492 = "torch_c.from_builtin_tensor"(%9491) : (tensor<4x1x1024xf32>) -> !torch.vtensor<[4,1,1024],f32>
    %9493 = "torch.aten.div.Tensor"(%9492, %820) : (!torch.vtensor<[4,1,1024],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,1024],f32>
    %9494 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %9495 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %9496 = "torch.aten.clamp"(%9493, %9494, %9495) : (!torch.vtensor<[4,1,1024],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,1024],f32>
    %9497 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %9498 = "torch.prims.convert_element_type"(%9496, %9497) : (!torch.vtensor<[4,1,1024],f32>, !torch.int) -> !torch.vtensor<[4,1,1024],f8E4M3FNUZ>
    %9499 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %9500 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %9501 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %9502 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %9503 = "torch.prim.ListConstruct"(%9499, %9500, %9501, %9502) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9504 = "torch.aten.view"(%9450, %9503) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,1,32,128],f8E4M3FNUZ>
    %9505 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %9506 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %9507 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %9508 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %9509 = "torch.prim.ListConstruct"(%9505, %9506, %9507, %9508) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9510 = "torch.aten.view"(%9474, %9509) : (!torch.vtensor<[4,1,1024],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>
    %9511 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %9512 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %9513 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %9514 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %9515 = "torch.prim.ListConstruct"(%9511, %9512, %9513, %9514) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9516 = "torch.aten.view"(%9498, %9515) : (!torch.vtensor<[4,1,1024],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>
    %9517 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %9518 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %9519 = "torch.aten.transpose.int"(%9504, %9517, %9518) : (!torch.vtensor<[4,1,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %9520 = "torch.aten.mul.Tensor"(%9519, %1637) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,1,1,128],bf16>) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %9521 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %9522 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %9523 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %9524 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %9525 = "torch.aten.slice.Tensor"(%9519, %9521, %9522, %9523, %9524) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,32,1,64],f8E4M3FNUZ>
    %9526 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %9527 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %9528 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %9529 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %9530 = "torch.aten.slice.Tensor"(%9519, %9526, %9527, %9528, %9529) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,32,1,64],f8E4M3FNUZ>
    %9531 = "torch.aten.neg"(%9530) : (!torch.vtensor<[4,32,1,64],f8E4M3FNUZ>) -> !torch.vtensor<[4,32,1,64],f8E4M3FNUZ>
    %9532 = "torch.prim.ListConstruct"(%9531, %9525) : (!torch.vtensor<[4,32,1,64],f8E4M3FNUZ>, !torch.vtensor<[4,32,1,64],f8E4M3FNUZ>) -> !torch.list<vtensor>
    %9533 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %9534 = "torch.aten.cat"(%9532, %9533) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %9535 = "torch.aten.mul.Tensor"(%9534, %1651) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,1,1,128],bf16>) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %9536 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %9537 = "torch.aten.add.Tensor"(%9520, %9535, %9536) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %9538 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %9539 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %9540 = "torch.aten.transpose.int"(%9537, %9538, %9539) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,1,32,128],f8E4M3FNUZ>
    %9541 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %9542 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %9543 = "torch.aten.transpose.int"(%9510, %9541, %9542) : (!torch.vtensor<[4,1,8,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %9544 = "torch.aten.mul.Tensor"(%9543, %1637) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,1,1,128],bf16>) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %9545 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %9546 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %9547 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %9548 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %9549 = "torch.aten.slice.Tensor"(%9543, %9545, %9546, %9547, %9548) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,8,1,64],f8E4M3FNUZ>
    %9550 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %9551 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %9552 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %9553 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %9554 = "torch.aten.slice.Tensor"(%9543, %9550, %9551, %9552, %9553) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,8,1,64],f8E4M3FNUZ>
    %9555 = "torch.aten.neg"(%9554) : (!torch.vtensor<[4,8,1,64],f8E4M3FNUZ>) -> !torch.vtensor<[4,8,1,64],f8E4M3FNUZ>
    %9556 = "torch.prim.ListConstruct"(%9555, %9549) : (!torch.vtensor<[4,8,1,64],f8E4M3FNUZ>, !torch.vtensor<[4,8,1,64],f8E4M3FNUZ>) -> !torch.list<vtensor>
    %9557 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %9558 = "torch.aten.cat"(%9556, %9557) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %9559 = "torch.aten.mul.Tensor"(%9558, %1651) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,1,1,128],bf16>) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %9560 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %9561 = "torch.aten.add.Tensor"(%9544, %9559, %9560) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %9562 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %9563 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %9564 = "torch.aten.transpose.int"(%9561, %9562, %9563) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>
    %9565 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %9566 = "torch.aten.floor_divide.Scalar"(%arg64, %9565) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4],si64>
    %9567 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %9568 = "torch.aten.unsqueeze"(%9566, %9567) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %9569 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %9570 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %9571 = "torch.aten.gather"(%arg65, %9569, %9568, %9570) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.vtensor<[4,1],si64>, !torch.bool) -> !torch.vtensor<[4,1],si64>
    %9572 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %9573 = "torch.aten.remainder.Scalar"(%arg64, %9572) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4],si64>
    %9574 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %9575 = "torch.aten.unsqueeze"(%9573, %9574) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %9576 = "torch.constant.none"() : () -> !torch.none
    %9577 = "torch.aten.clone"(%821, %9576) : (!torch.vtensor<[],si64>, !torch.none) -> !torch.vtensor<[],si64>
    %9578 = "torch.aten.detach"(%9577) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %9579 = "torch.aten.detach"(%9578) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %9580 = "torch.aten.detach"(%9579) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %9581 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %9582 = "torch.aten.unsqueeze"(%9580, %9581) : (!torch.vtensor<[],si64>, !torch.int) -> !torch.vtensor<[1],si64>
    %9583 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %9584 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %9585 = "torch.prim.ListConstruct"(%9583, %9584) : (!torch.int, !torch.int) -> !torch.list<int>
    %9586 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %9587 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %9588 = "torch.prim.ListConstruct"(%9586, %9587) : (!torch.int, !torch.int) -> !torch.list<int>
    %9589 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %9590 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %9591 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %9592 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %9593 = "torch.aten.empty_strided"(%9585, %9588, %9589, %9590, %9591, %9592) : (!torch.list<int>, !torch.list<int>, !torch.int, !torch.int, !torch.Device, !torch.bool) -> !torch.vtensor<[4,1],si64>
    %9594 = "torch.constant.int"() <{value = 16 : i64}> : () -> !torch.int
    %9595 = "torch.aten.fill.Scalar"(%9593, %9594) : (!torch.vtensor<[4,1],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %9596 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %9597 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %9598 = "torch.prim.ListConstruct"(%9596, %9597) : (!torch.int, !torch.int) -> !torch.list<int>
    %9599 = "torch.aten.repeat"(%9582, %9598) : (!torch.vtensor<[1],si64>, !torch.list<int>) -> !torch.vtensor<[4,1],si64>
    %9600 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %9601 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %9602 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %9603 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %9604 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %9605 = "torch.prim.ListConstruct"(%1483, %9600, %9601, %9602, %9603, %9604) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9606 = "torch.aten.view"(%9176, %9605) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%9606, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %9607 = "torch.prim.ListConstruct"(%9571, %9595, %9599, %9575) : (!torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>) -> !torch.list<optional<vtensor>>
    %9608 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %9609 = "torch.aten.index_put"(%9606, %9607, %9564, %9608) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>, !torch.bool) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%9609, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %9610 = "torch.constant.int"() <{value = 2097152 : i64}> : () -> !torch.int
    %9611 = "torch.prim.ListConstruct"(%1483, %9610) : (!torch.int, !torch.int) -> !torch.list<int>
    %9612 = "torch.aten.view"(%9609, %9611) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,2097152],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%9612, %1479) <{shape_expressions = #map2}> : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> ()
    %9613 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %9614 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %9615 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %9616 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %9617 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %9618 = "torch.prim.ListConstruct"(%1483, %9613, %9614, %9615, %9616, %9617) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9619 = "torch.aten.view"(%9612, %9618) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%9619, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %9620 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %9621 = "torch.aten.floor_divide.Scalar"(%arg64, %9620) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4],si64>
    %9622 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %9623 = "torch.aten.unsqueeze"(%9621, %9622) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %9624 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %9625 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %9626 = "torch.aten.gather"(%arg65, %9624, %9623, %9625) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.vtensor<[4,1],si64>, !torch.bool) -> !torch.vtensor<[4,1],si64>
    %9627 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %9628 = "torch.aten.remainder.Scalar"(%arg64, %9627) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4],si64>
    %9629 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %9630 = "torch.aten.unsqueeze"(%9628, %9629) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %9631 = "torch.constant.none"() : () -> !torch.none
    %9632 = "torch.aten.clone"(%822, %9631) : (!torch.vtensor<[],si64>, !torch.none) -> !torch.vtensor<[],si64>
    %9633 = "torch.aten.detach"(%9632) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %9634 = "torch.aten.detach"(%9633) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %9635 = "torch.aten.detach"(%9634) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %9636 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %9637 = "torch.aten.unsqueeze"(%9635, %9636) : (!torch.vtensor<[],si64>, !torch.int) -> !torch.vtensor<[1],si64>
    %9638 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %9639 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %9640 = "torch.prim.ListConstruct"(%9638, %9639) : (!torch.int, !torch.int) -> !torch.list<int>
    %9641 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %9642 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %9643 = "torch.prim.ListConstruct"(%9641, %9642) : (!torch.int, !torch.int) -> !torch.list<int>
    %9644 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %9645 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %9646 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %9647 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %9648 = "torch.aten.empty_strided"(%9640, %9643, %9644, %9645, %9646, %9647) : (!torch.list<int>, !torch.list<int>, !torch.int, !torch.int, !torch.Device, !torch.bool) -> !torch.vtensor<[4,1],si64>
    %9649 = "torch.constant.int"() <{value = 16 : i64}> : () -> !torch.int
    %9650 = "torch.aten.fill.Scalar"(%9648, %9649) : (!torch.vtensor<[4,1],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %9651 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %9652 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %9653 = "torch.prim.ListConstruct"(%9651, %9652) : (!torch.int, !torch.int) -> !torch.list<int>
    %9654 = "torch.aten.repeat"(%9637, %9653) : (!torch.vtensor<[1],si64>, !torch.list<int>) -> !torch.vtensor<[4,1],si64>
    %9655 = "torch.prim.ListConstruct"(%9626, %9650, %9654, %9630) : (!torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>) -> !torch.list<optional<vtensor>>
    %9656 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %9657 = "torch.aten.index_put"(%9619, %9655, %9516, %9656) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>, !torch.bool) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%9657, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %9658 = "torch.constant.int"() <{value = 2097152 : i64}> : () -> !torch.int
    %9659 = "torch.prim.ListConstruct"(%1483, %9658) : (!torch.int, !torch.int) -> !torch.list<int>
    %9660 = "torch.aten.view"(%9657, %9659) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,2097152],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%9660, %1479) <{shape_expressions = #map2}> : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> ()
    %9661 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %9662 = "torch.aten.mul.Scalar"(%arg65, %9661) : (!torch.vtensor<[4,?],si64>, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%9662, %1478) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %9663 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %9664 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %9665 = "torch.aten.add.Scalar"(%9662, %9663, %9664) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%9665, %1478) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %9666 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %9667 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %9668 = "torch.aten.add.Scalar"(%9665, %9666, %9667) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%9668, %1478) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %9669 = "torch.prim.ListConstruct"(%1924) : (!torch.int) -> !torch.list<int>
    %9670 = "torch.aten.view"(%9668, %9669) : (!torch.vtensor<[4,?],si64>, !torch.list<int>) -> !torch.vtensor<[?],si64>
    "torch.bind_symbolic_shape"(%9670, %1478) <{shape_expressions = #map24}> : (!torch.vtensor<[?],si64>, !torch.int) -> ()
    %9671 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %9672 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %9673 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %9674 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %9675 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %9676 = "torch.prim.ListConstruct"(%1483, %9671, %9672, %9673, %9674, %9675) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9677 = "torch.aten.view"(%9660, %9676) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%9677, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %9678 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %9679 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %9680 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %9681 = "torch.prim.ListConstruct"(%1914, %9678, %9679, %9680) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9682 = "torch.aten.view"(%9677, %9681) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%9682, %1479) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %9683 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %9684 = "torch.aten.index_select"(%9682, %9683, %9670) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int, !torch.vtensor<[?],si64>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%9684, %1478) <{shape_expressions = #map23}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %9685 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %9686 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %9687 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %9688 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %9689 = "torch.prim.ListConstruct"(%9685, %1481, %9686, %9687, %9688) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9690 = "torch.aten.view"(%9684, %9689) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%9690, %1478) <{shape_expressions = #map22}> : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %9691 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %9692 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %9693 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %9694 = "torch.prim.ListConstruct"(%9691, %1485, %9692, %9693) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9695 = "torch.aten.view"(%9690, %9694) : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%9695, %1478) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %9696 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %9697 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %9698 = "torch.aten.add.Scalar"(%9665, %9696, %9697) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%9698, %1478) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %9699 = "torch.prim.ListConstruct"(%1924) : (!torch.int) -> !torch.list<int>
    %9700 = "torch.aten.view"(%9698, %9699) : (!torch.vtensor<[4,?],si64>, !torch.list<int>) -> !torch.vtensor<[?],si64>
    "torch.bind_symbolic_shape"(%9700, %1478) <{shape_expressions = #map24}> : (!torch.vtensor<[?],si64>, !torch.int) -> ()
    %9701 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %9702 = "torch.aten.index_select"(%9682, %9701, %9700) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int, !torch.vtensor<[?],si64>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%9702, %1478) <{shape_expressions = #map23}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %9703 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %9704 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %9705 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %9706 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %9707 = "torch.prim.ListConstruct"(%9703, %1481, %9704, %9705, %9706) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9708 = "torch.aten.view"(%9702, %9707) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%9708, %1478) <{shape_expressions = #map22}> : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %9709 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %9710 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %9711 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %9712 = "torch.prim.ListConstruct"(%9709, %1485, %9710, %9711) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9713 = "torch.aten.view"(%9708, %9712) : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%9713, %1478) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %9714 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %9715 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %9716 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %9717 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %9718 = "torch.aten.slice.Tensor"(%9695, %9714, %9715, %9716, %9717) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%9718, %1478) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %9719 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %9720 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %9721 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %9722 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %9723 = "torch.aten.slice.Tensor"(%9713, %9719, %9720, %9721, %9722) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%9723, %1478) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %9724 = "torch.constant.int"() <{value = -2 : i64}> : () -> !torch.int
    %9725 = "torch.aten.unsqueeze"(%9718, %9724) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%9725, %1478) <{shape_expressions = #map25}> : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.int) -> ()
    %9726 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %9727 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %9728 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %9729 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %9730 = "torch.prim.ListConstruct"(%9726, %1485, %9727, %9728, %9729) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9731 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %9732 = "torch.aten.expand"(%9725, %9730, %9731) : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%9732, %1478) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %9733 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %9734 = "torch.aten.clone"(%9732, %9733) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%9734, %1478) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %9735 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %9736 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %9737 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %9738 = "torch.prim.ListConstruct"(%9735, %1485, %9736, %9737) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9739 = "torch.aten._unsafe_view"(%9734, %9738) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%9739, %1478) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %9740 = "torch.constant.int"() <{value = -2 : i64}> : () -> !torch.int
    %9741 = "torch.aten.unsqueeze"(%9723, %9740) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%9741, %1478) <{shape_expressions = #map25}> : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.int) -> ()
    %9742 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %9743 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %9744 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %9745 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %9746 = "torch.prim.ListConstruct"(%9742, %1485, %9743, %9744, %9745) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9747 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %9748 = "torch.aten.expand"(%9741, %9746, %9747) : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%9748, %1478) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %9749 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %9750 = "torch.aten.clone"(%9748, %9749) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%9750, %1478) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %9751 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %9752 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %9753 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %9754 = "torch.prim.ListConstruct"(%9751, %1485, %9752, %9753) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9755 = "torch.aten._unsafe_view"(%9750, %9754) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%9755, %1478) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %9756 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %9757 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %9758 = "torch.aten.transpose.int"(%9540, %9756, %9757) : (!torch.vtensor<[4,1,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %9759 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %9760 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %9761 = "torch.aten.transpose.int"(%9739, %9759, %9760) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%9761, %1478) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %9762 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %9763 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %9764 = "torch.aten.transpose.int"(%9755, %9762, %9763) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%9764, %1478) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %9765 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %9766 = "torch.aten.squeeze.dim"(%1516, %9765) : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,1,1,?],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%9766, %1478) <{shape_expressions = #map7}> : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>, !torch.int) -> ()
    %9767 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %9768 = "torch.aten.squeeze.dim"(%9766, %9767) : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,1,1,?],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%9768, %1478) <{shape_expressions = #map7}> : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>, !torch.int) -> ()
    %9769 = "torch_c.to_builtin_tensor"(%9758) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>) -> tensor<4x32x1x128xf8E4M3FNUZ>
    %9770 = "tensor.cast"(%9769) : (tensor<4x32x1x128xf8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %9771 = "torch_c.to_builtin_tensor"(%9761) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %9772 = "torch_c.to_builtin_tensor"(%9764) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %9773 = "torch_c.to_builtin_tensor"(%9768) : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>) -> tensor<4x1x1x?xf8E4M3FNUZ>
    %9774 = "tensor.cast"(%9773) : (tensor<4x1x1x?xf8E4M3FNUZ>) -> tensor<?x?x?x?xf8E4M3FNUZ>
    %9775 = "torch_c.to_builtin_tensor"(%824) : (!torch.vtensor<[],f32>) -> tensor<f32>
    %9776 = "util.call"(%9770, %9771, %9772, %9775, %9774) <{callee = @sharktank_masked_flash_attention_4_32_128_128_f8E4M3FNUZ_f32_f32}> : (tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<f32>, tensor<?x?x?x?xf8E4M3FNUZ>) -> tensor<4x32x?x128xf32>
    %9777 = "tensor.cast"(%9776) : (tensor<4x32x?x128xf32>) -> tensor<4x32x1x128xf32>
    %9778 = "torch_c.from_builtin_tensor"(%9777) : (tensor<4x32x1x128xf32>) -> !torch.vtensor<[4,32,1,128],f32>
    %9779 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %9780 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %9781 = "torch.aten.transpose.int"(%9778, %9779, %9780) : (!torch.vtensor<[4,32,1,128],f32>, !torch.int, !torch.int) -> !torch.vtensor<[4,1,32,128],f32>
    %9782 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %9783 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %9784 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %9785 = "torch.prim.ListConstruct"(%9782, %9783, %9784) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9786 = "torch.aten.view"(%9781, %9785) : (!torch.vtensor<[4,1,32,128],f32>, !torch.list<int>) -> !torch.vtensor<[4,1,4096],f32>
    %9787 = "torch.aten.div.Tensor"(%9786, %826) : (!torch.vtensor<[4,1,4096],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],f32>
    %9788 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %9789 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %9790 = "torch.aten.clamp"(%9787, %9788, %9789) : (!torch.vtensor<[4,1,4096],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],f32>
    %9791 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %9792 = "torch.prims.convert_element_type"(%9790, %9791) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %9793 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %9794 = "torch.aten.unsqueeze"(%828, %9793) : (!torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>
    %9795 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %9796 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %9797 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %9798 = "torch.prim.ListConstruct"(%9795, %9796, %9797) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9799 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %9800 = "torch.aten.expand"(%9794, %9798, %9799) : (!torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ>
    %9801 = "torch_c.to_builtin_tensor"(%9792) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %9802 = "torch_c.to_builtin_tensor"(%9800) : (!torch.vtensor<[4,4096,4096],f8E4M3FNUZ>) -> tensor<4x4096x4096xf8E4M3FNUZ>
    %9803 = "util.call"(%9801, %9802) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x1x4096xf32>
    %9804 = "torch_c.from_builtin_tensor"(%9803) : (tensor<4x1x4096xf32>) -> !torch.vtensor<[4,1,4096],f32>
    %9805 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %9806 = "torch.prims.convert_element_type"(%9804, %9805) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %9807 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %9808 = "torch.aten.add.Tensor"(%9407, %9806, %9807) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %9809 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %9810 = "torch.prims.convert_element_type"(%9808, %9809) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f32>
    %9811 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %9812 = "torch.aten.pow.Tensor_Scalar"(%9810, %9811) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],f32>
    %9813 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %9814 = "torch.prim.ListConstruct"(%9813) : (!torch.int) -> !torch.list<int>
    %9815 = "torch.constant.bool"() <{value = true}> : () -> !torch.bool
    %9816 = "torch.constant.none"() : () -> !torch.none
    %9817 = "torch.aten.mean.dim"(%9812, %9814, %9815, %9816) : (!torch.vtensor<[4,1,4096],f32>, !torch.list<int>, !torch.bool, !torch.none) -> !torch.vtensor<[4,1,1],f32>
    %9818 = "torch.constant.float"() <{value = 1.000000e-05 : f64}> : () -> !torch.float
    %9819 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %9820 = "torch.aten.add.Scalar"(%9817, %9818, %9819) : (!torch.vtensor<[4,1,1],f32>, !torch.float, !torch.int) -> !torch.vtensor<[4,1,1],f32>
    %9821 = "torch.aten.rsqrt"(%9820) : (!torch.vtensor<[4,1,1],f32>) -> !torch.vtensor<[4,1,1],f32>
    %9822 = "torch.aten.mul.Tensor"(%9810, %9821) : (!torch.vtensor<[4,1,4096],f32>, !torch.vtensor<[4,1,1],f32>) -> !torch.vtensor<[4,1,4096],f32>
    %9823 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %9824 = "torch.prims.convert_element_type"(%9822, %9823) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %9825 = "torch.aten.mul.Tensor"(%830, %9824) : (!torch.vtensor<[4096],bf16>, !torch.vtensor<[4,1,4096],bf16>) -> !torch.vtensor<[4,1,4096],bf16>
    %9826 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %9827 = "torch.prims.convert_element_type"(%9825, %9826) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %9828 = "torch.aten.div.Tensor"(%9827, %832) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %9829 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %9830 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %9831 = "torch.aten.clamp"(%9828, %9829, %9830) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %9832 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %9833 = "torch.prims.convert_element_type"(%9831, %9832) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %9834 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %9835 = "torch.aten.unsqueeze"(%834, %9834) : (!torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>
    %9836 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %9837 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %9838 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %9839 = "torch.prim.ListConstruct"(%9836, %9837, %9838) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9840 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %9841 = "torch.aten.expand"(%9835, %9839, %9840) : (!torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ>
    %9842 = "torch_c.to_builtin_tensor"(%9833) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %9843 = "torch_c.to_builtin_tensor"(%9841) : (!torch.vtensor<[4,14336,4096],f8E4M3FNUZ>) -> tensor<4x14336x4096xf8E4M3FNUZ>
    %9844 = "util.call"(%9842, %9843) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x1x14336xf32>
    %9845 = "torch_c.from_builtin_tensor"(%9844) : (tensor<4x1x14336xf32>) -> !torch.vtensor<[4,1,14336],f32>
    %9846 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %9847 = "torch.prims.convert_element_type"(%9845, %9846) : (!torch.vtensor<[4,1,14336],f32>, !torch.int) -> !torch.vtensor<[4,1,14336],bf16>
    %9848 = "torch.aten.silu"(%9847) : (!torch.vtensor<[4,1,14336],bf16>) -> !torch.vtensor<[4,1,14336],bf16>
    %9849 = "torch.aten.div.Tensor"(%9827, %836) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %9850 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %9851 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %9852 = "torch.aten.clamp"(%9849, %9850, %9851) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %9853 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %9854 = "torch.prims.convert_element_type"(%9852, %9853) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %9855 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %9856 = "torch.aten.unsqueeze"(%838, %9855) : (!torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>
    %9857 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %9858 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %9859 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %9860 = "torch.prim.ListConstruct"(%9857, %9858, %9859) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9861 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %9862 = "torch.aten.expand"(%9856, %9860, %9861) : (!torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ>
    %9863 = "torch_c.to_builtin_tensor"(%9854) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %9864 = "torch_c.to_builtin_tensor"(%9862) : (!torch.vtensor<[4,14336,4096],f8E4M3FNUZ>) -> tensor<4x14336x4096xf8E4M3FNUZ>
    %9865 = "util.call"(%9863, %9864) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x1x14336xf32>
    %9866 = "torch_c.from_builtin_tensor"(%9865) : (tensor<4x1x14336xf32>) -> !torch.vtensor<[4,1,14336],f32>
    %9867 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %9868 = "torch.prims.convert_element_type"(%9866, %9867) : (!torch.vtensor<[4,1,14336],f32>, !torch.int) -> !torch.vtensor<[4,1,14336],bf16>
    %9869 = "torch.aten.mul.Tensor"(%9848, %9868) : (!torch.vtensor<[4,1,14336],bf16>, !torch.vtensor<[4,1,14336],bf16>) -> !torch.vtensor<[4,1,14336],bf16>
    %9870 = "torch.aten.div.Tensor"(%9869, %840) : (!torch.vtensor<[4,1,14336],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,14336],bf16>
    %9871 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %9872 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %9873 = "torch.aten.clamp"(%9870, %9871, %9872) : (!torch.vtensor<[4,1,14336],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,14336],bf16>
    %9874 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %9875 = "torch.prims.convert_element_type"(%9873, %9874) : (!torch.vtensor<[4,1,14336],bf16>, !torch.int) -> !torch.vtensor<[4,1,14336],f8E4M3FNUZ>
    %9876 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %9877 = "torch.aten.unsqueeze"(%842, %9876) : (!torch.vtensor<[4096,14336],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,14336],f8E4M3FNUZ>
    %9878 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %9879 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %9880 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %9881 = "torch.prim.ListConstruct"(%9878, %9879, %9880) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9882 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %9883 = "torch.aten.expand"(%9877, %9881, %9882) : (!torch.vtensor<[1,4096,14336],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,14336],f8E4M3FNUZ>
    %9884 = "torch_c.to_builtin_tensor"(%9875) : (!torch.vtensor<[4,1,14336],f8E4M3FNUZ>) -> tensor<4x1x14336xf8E4M3FNUZ>
    %9885 = "torch_c.to_builtin_tensor"(%9883) : (!torch.vtensor<[4,4096,14336],f8E4M3FNUZ>) -> tensor<4x4096x14336xf8E4M3FNUZ>
    %9886 = "util.call"(%9884, %9885) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x14336xf8E4M3FNUZ_R4x4096x14336xf8E4M3FNUZ}> : (tensor<4x1x14336xf8E4M3FNUZ>, tensor<4x4096x14336xf8E4M3FNUZ>) -> tensor<4x1x4096xf32>
    %9887 = "torch_c.from_builtin_tensor"(%9886) : (tensor<4x1x4096xf32>) -> !torch.vtensor<[4,1,4096],f32>
    %9888 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %9889 = "torch.prims.convert_element_type"(%9887, %9888) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %9890 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %9891 = "torch.aten.add.Tensor"(%9808, %9889, %9890) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %9892 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %9893 = "torch.prims.convert_element_type"(%9891, %9892) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f32>
    %9894 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %9895 = "torch.aten.pow.Tensor_Scalar"(%9893, %9894) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],f32>
    %9896 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %9897 = "torch.prim.ListConstruct"(%9896) : (!torch.int) -> !torch.list<int>
    %9898 = "torch.constant.bool"() <{value = true}> : () -> !torch.bool
    %9899 = "torch.constant.none"() : () -> !torch.none
    %9900 = "torch.aten.mean.dim"(%9895, %9897, %9898, %9899) : (!torch.vtensor<[4,1,4096],f32>, !torch.list<int>, !torch.bool, !torch.none) -> !torch.vtensor<[4,1,1],f32>
    %9901 = "torch.constant.float"() <{value = 1.000000e-05 : f64}> : () -> !torch.float
    %9902 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %9903 = "torch.aten.add.Scalar"(%9900, %9901, %9902) : (!torch.vtensor<[4,1,1],f32>, !torch.float, !torch.int) -> !torch.vtensor<[4,1,1],f32>
    %9904 = "torch.aten.rsqrt"(%9903) : (!torch.vtensor<[4,1,1],f32>) -> !torch.vtensor<[4,1,1],f32>
    %9905 = "torch.aten.mul.Tensor"(%9893, %9904) : (!torch.vtensor<[4,1,4096],f32>, !torch.vtensor<[4,1,1],f32>) -> !torch.vtensor<[4,1,4096],f32>
    %9906 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %9907 = "torch.prims.convert_element_type"(%9905, %9906) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %9908 = "torch.aten.mul.Tensor"(%844, %9907) : (!torch.vtensor<[4096],bf16>, !torch.vtensor<[4,1,4096],bf16>) -> !torch.vtensor<[4,1,4096],bf16>
    %9909 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %9910 = "torch.prims.convert_element_type"(%9908, %9909) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %9911 = "torch.aten.div.Tensor"(%9910, %846) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %9912 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %9913 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %9914 = "torch.aten.clamp"(%9911, %9912, %9913) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %9915 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %9916 = "torch.prims.convert_element_type"(%9914, %9915) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %9917 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %9918 = "torch.aten.unsqueeze"(%848, %9917) : (!torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>
    %9919 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %9920 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %9921 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %9922 = "torch.prim.ListConstruct"(%9919, %9920, %9921) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9923 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %9924 = "torch.aten.expand"(%9918, %9922, %9923) : (!torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ>
    %9925 = "torch_c.to_builtin_tensor"(%9916) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %9926 = "torch_c.to_builtin_tensor"(%9924) : (!torch.vtensor<[4,4096,4096],f8E4M3FNUZ>) -> tensor<4x4096x4096xf8E4M3FNUZ>
    %9927 = "util.call"(%9925, %9926) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x1x4096xf32>
    %9928 = "torch_c.from_builtin_tensor"(%9927) : (tensor<4x1x4096xf32>) -> !torch.vtensor<[4,1,4096],f32>
    %9929 = "torch.aten.div.Tensor"(%9928, %850) : (!torch.vtensor<[4,1,4096],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],f32>
    %9930 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %9931 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %9932 = "torch.aten.clamp"(%9929, %9930, %9931) : (!torch.vtensor<[4,1,4096],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],f32>
    %9933 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %9934 = "torch.prims.convert_element_type"(%9932, %9933) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %9935 = "torch.aten.div.Tensor"(%9910, %852) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %9936 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %9937 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %9938 = "torch.aten.clamp"(%9935, %9936, %9937) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %9939 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %9940 = "torch.prims.convert_element_type"(%9938, %9939) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %9941 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %9942 = "torch.aten.unsqueeze"(%854, %9941) : (!torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>
    %9943 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %9944 = "torch.constant.int"() <{value = 1024 : i64}> : () -> !torch.int
    %9945 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %9946 = "torch.prim.ListConstruct"(%9943, %9944, %9945) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9947 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %9948 = "torch.aten.expand"(%9942, %9946, %9947) : (!torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ>
    %9949 = "torch_c.to_builtin_tensor"(%9940) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %9950 = "torch_c.to_builtin_tensor"(%9948) : (!torch.vtensor<[4,1024,4096],f8E4M3FNUZ>) -> tensor<4x1024x4096xf8E4M3FNUZ>
    %9951 = "util.call"(%9949, %9950) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x1x1024xf32>
    %9952 = "torch_c.from_builtin_tensor"(%9951) : (tensor<4x1x1024xf32>) -> !torch.vtensor<[4,1,1024],f32>
    %9953 = "torch.aten.div.Tensor"(%9952, %856) : (!torch.vtensor<[4,1,1024],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,1024],f32>
    %9954 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %9955 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %9956 = "torch.aten.clamp"(%9953, %9954, %9955) : (!torch.vtensor<[4,1,1024],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,1024],f32>
    %9957 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %9958 = "torch.prims.convert_element_type"(%9956, %9957) : (!torch.vtensor<[4,1,1024],f32>, !torch.int) -> !torch.vtensor<[4,1,1024],f8E4M3FNUZ>
    %9959 = "torch.aten.div.Tensor"(%9910, %858) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %9960 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %9961 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %9962 = "torch.aten.clamp"(%9959, %9960, %9961) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %9963 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %9964 = "torch.prims.convert_element_type"(%9962, %9963) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %9965 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %9966 = "torch.aten.unsqueeze"(%860, %9965) : (!torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>
    %9967 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %9968 = "torch.constant.int"() <{value = 1024 : i64}> : () -> !torch.int
    %9969 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %9970 = "torch.prim.ListConstruct"(%9967, %9968, %9969) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9971 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %9972 = "torch.aten.expand"(%9966, %9970, %9971) : (!torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ>
    %9973 = "torch_c.to_builtin_tensor"(%9964) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %9974 = "torch_c.to_builtin_tensor"(%9972) : (!torch.vtensor<[4,1024,4096],f8E4M3FNUZ>) -> tensor<4x1024x4096xf8E4M3FNUZ>
    %9975 = "util.call"(%9973, %9974) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x1x1024xf32>
    %9976 = "torch_c.from_builtin_tensor"(%9975) : (tensor<4x1x1024xf32>) -> !torch.vtensor<[4,1,1024],f32>
    %9977 = "torch.aten.div.Tensor"(%9976, %862) : (!torch.vtensor<[4,1,1024],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,1024],f32>
    %9978 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %9979 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %9980 = "torch.aten.clamp"(%9977, %9978, %9979) : (!torch.vtensor<[4,1,1024],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,1024],f32>
    %9981 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %9982 = "torch.prims.convert_element_type"(%9980, %9981) : (!torch.vtensor<[4,1,1024],f32>, !torch.int) -> !torch.vtensor<[4,1,1024],f8E4M3FNUZ>
    %9983 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %9984 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %9985 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %9986 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %9987 = "torch.prim.ListConstruct"(%9983, %9984, %9985, %9986) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9988 = "torch.aten.view"(%9934, %9987) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,1,32,128],f8E4M3FNUZ>
    %9989 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %9990 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %9991 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %9992 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %9993 = "torch.prim.ListConstruct"(%9989, %9990, %9991, %9992) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9994 = "torch.aten.view"(%9958, %9993) : (!torch.vtensor<[4,1,1024],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>
    %9995 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %9996 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %9997 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %9998 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %9999 = "torch.prim.ListConstruct"(%9995, %9996, %9997, %9998) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10000 = "torch.aten.view"(%9982, %9999) : (!torch.vtensor<[4,1,1024],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>
    %10001 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %10002 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %10003 = "torch.aten.transpose.int"(%9988, %10001, %10002) : (!torch.vtensor<[4,1,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %10004 = "torch.aten.mul.Tensor"(%10003, %1637) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,1,1,128],bf16>) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %10005 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %10006 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %10007 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %10008 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %10009 = "torch.aten.slice.Tensor"(%10003, %10005, %10006, %10007, %10008) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,32,1,64],f8E4M3FNUZ>
    %10010 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %10011 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %10012 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %10013 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %10014 = "torch.aten.slice.Tensor"(%10003, %10010, %10011, %10012, %10013) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,32,1,64],f8E4M3FNUZ>
    %10015 = "torch.aten.neg"(%10014) : (!torch.vtensor<[4,32,1,64],f8E4M3FNUZ>) -> !torch.vtensor<[4,32,1,64],f8E4M3FNUZ>
    %10016 = "torch.prim.ListConstruct"(%10015, %10009) : (!torch.vtensor<[4,32,1,64],f8E4M3FNUZ>, !torch.vtensor<[4,32,1,64],f8E4M3FNUZ>) -> !torch.list<vtensor>
    %10017 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %10018 = "torch.aten.cat"(%10016, %10017) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %10019 = "torch.aten.mul.Tensor"(%10018, %1651) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,1,1,128],bf16>) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %10020 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %10021 = "torch.aten.add.Tensor"(%10004, %10019, %10020) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %10022 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %10023 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %10024 = "torch.aten.transpose.int"(%10021, %10022, %10023) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,1,32,128],f8E4M3FNUZ>
    %10025 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %10026 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %10027 = "torch.aten.transpose.int"(%9994, %10025, %10026) : (!torch.vtensor<[4,1,8,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %10028 = "torch.aten.mul.Tensor"(%10027, %1637) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,1,1,128],bf16>) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %10029 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %10030 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %10031 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %10032 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %10033 = "torch.aten.slice.Tensor"(%10027, %10029, %10030, %10031, %10032) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,8,1,64],f8E4M3FNUZ>
    %10034 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %10035 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %10036 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %10037 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %10038 = "torch.aten.slice.Tensor"(%10027, %10034, %10035, %10036, %10037) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,8,1,64],f8E4M3FNUZ>
    %10039 = "torch.aten.neg"(%10038) : (!torch.vtensor<[4,8,1,64],f8E4M3FNUZ>) -> !torch.vtensor<[4,8,1,64],f8E4M3FNUZ>
    %10040 = "torch.prim.ListConstruct"(%10039, %10033) : (!torch.vtensor<[4,8,1,64],f8E4M3FNUZ>, !torch.vtensor<[4,8,1,64],f8E4M3FNUZ>) -> !torch.list<vtensor>
    %10041 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %10042 = "torch.aten.cat"(%10040, %10041) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %10043 = "torch.aten.mul.Tensor"(%10042, %1651) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,1,1,128],bf16>) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %10044 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %10045 = "torch.aten.add.Tensor"(%10028, %10043, %10044) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %10046 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %10047 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %10048 = "torch.aten.transpose.int"(%10045, %10046, %10047) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>
    %10049 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %10050 = "torch.aten.floor_divide.Scalar"(%arg64, %10049) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4],si64>
    %10051 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %10052 = "torch.aten.unsqueeze"(%10050, %10051) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %10053 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %10054 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %10055 = "torch.aten.gather"(%arg65, %10053, %10052, %10054) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.vtensor<[4,1],si64>, !torch.bool) -> !torch.vtensor<[4,1],si64>
    %10056 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %10057 = "torch.aten.remainder.Scalar"(%arg64, %10056) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4],si64>
    %10058 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %10059 = "torch.aten.unsqueeze"(%10057, %10058) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %10060 = "torch.constant.none"() : () -> !torch.none
    %10061 = "torch.aten.clone"(%863, %10060) : (!torch.vtensor<[],si64>, !torch.none) -> !torch.vtensor<[],si64>
    %10062 = "torch.aten.detach"(%10061) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %10063 = "torch.aten.detach"(%10062) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %10064 = "torch.aten.detach"(%10063) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %10065 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %10066 = "torch.aten.unsqueeze"(%10064, %10065) : (!torch.vtensor<[],si64>, !torch.int) -> !torch.vtensor<[1],si64>
    %10067 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %10068 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %10069 = "torch.prim.ListConstruct"(%10067, %10068) : (!torch.int, !torch.int) -> !torch.list<int>
    %10070 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %10071 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %10072 = "torch.prim.ListConstruct"(%10070, %10071) : (!torch.int, !torch.int) -> !torch.list<int>
    %10073 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %10074 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %10075 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %10076 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %10077 = "torch.aten.empty_strided"(%10069, %10072, %10073, %10074, %10075, %10076) : (!torch.list<int>, !torch.list<int>, !torch.int, !torch.int, !torch.Device, !torch.bool) -> !torch.vtensor<[4,1],si64>
    %10078 = "torch.constant.int"() <{value = 17 : i64}> : () -> !torch.int
    %10079 = "torch.aten.fill.Scalar"(%10077, %10078) : (!torch.vtensor<[4,1],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %10080 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %10081 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %10082 = "torch.prim.ListConstruct"(%10080, %10081) : (!torch.int, !torch.int) -> !torch.list<int>
    %10083 = "torch.aten.repeat"(%10066, %10082) : (!torch.vtensor<[1],si64>, !torch.list<int>) -> !torch.vtensor<[4,1],si64>
    %10084 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %10085 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %10086 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %10087 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %10088 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %10089 = "torch.prim.ListConstruct"(%1483, %10084, %10085, %10086, %10087, %10088) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10090 = "torch.aten.view"(%9660, %10089) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%10090, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %10091 = "torch.prim.ListConstruct"(%10055, %10079, %10083, %10059) : (!torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>) -> !torch.list<optional<vtensor>>
    %10092 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %10093 = "torch.aten.index_put"(%10090, %10091, %10048, %10092) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>, !torch.bool) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%10093, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %10094 = "torch.constant.int"() <{value = 2097152 : i64}> : () -> !torch.int
    %10095 = "torch.prim.ListConstruct"(%1483, %10094) : (!torch.int, !torch.int) -> !torch.list<int>
    %10096 = "torch.aten.view"(%10093, %10095) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,2097152],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%10096, %1479) <{shape_expressions = #map2}> : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> ()
    %10097 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %10098 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %10099 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %10100 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %10101 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %10102 = "torch.prim.ListConstruct"(%1483, %10097, %10098, %10099, %10100, %10101) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10103 = "torch.aten.view"(%10096, %10102) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%10103, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %10104 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %10105 = "torch.aten.floor_divide.Scalar"(%arg64, %10104) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4],si64>
    %10106 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %10107 = "torch.aten.unsqueeze"(%10105, %10106) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %10108 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %10109 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %10110 = "torch.aten.gather"(%arg65, %10108, %10107, %10109) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.vtensor<[4,1],si64>, !torch.bool) -> !torch.vtensor<[4,1],si64>
    %10111 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %10112 = "torch.aten.remainder.Scalar"(%arg64, %10111) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4],si64>
    %10113 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %10114 = "torch.aten.unsqueeze"(%10112, %10113) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %10115 = "torch.constant.none"() : () -> !torch.none
    %10116 = "torch.aten.clone"(%864, %10115) : (!torch.vtensor<[],si64>, !torch.none) -> !torch.vtensor<[],si64>
    %10117 = "torch.aten.detach"(%10116) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %10118 = "torch.aten.detach"(%10117) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %10119 = "torch.aten.detach"(%10118) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %10120 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %10121 = "torch.aten.unsqueeze"(%10119, %10120) : (!torch.vtensor<[],si64>, !torch.int) -> !torch.vtensor<[1],si64>
    %10122 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %10123 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %10124 = "torch.prim.ListConstruct"(%10122, %10123) : (!torch.int, !torch.int) -> !torch.list<int>
    %10125 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %10126 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %10127 = "torch.prim.ListConstruct"(%10125, %10126) : (!torch.int, !torch.int) -> !torch.list<int>
    %10128 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %10129 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %10130 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %10131 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %10132 = "torch.aten.empty_strided"(%10124, %10127, %10128, %10129, %10130, %10131) : (!torch.list<int>, !torch.list<int>, !torch.int, !torch.int, !torch.Device, !torch.bool) -> !torch.vtensor<[4,1],si64>
    %10133 = "torch.constant.int"() <{value = 17 : i64}> : () -> !torch.int
    %10134 = "torch.aten.fill.Scalar"(%10132, %10133) : (!torch.vtensor<[4,1],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %10135 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %10136 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %10137 = "torch.prim.ListConstruct"(%10135, %10136) : (!torch.int, !torch.int) -> !torch.list<int>
    %10138 = "torch.aten.repeat"(%10121, %10137) : (!torch.vtensor<[1],si64>, !torch.list<int>) -> !torch.vtensor<[4,1],si64>
    %10139 = "torch.prim.ListConstruct"(%10110, %10134, %10138, %10114) : (!torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>) -> !torch.list<optional<vtensor>>
    %10140 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %10141 = "torch.aten.index_put"(%10103, %10139, %10000, %10140) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>, !torch.bool) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%10141, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %10142 = "torch.constant.int"() <{value = 2097152 : i64}> : () -> !torch.int
    %10143 = "torch.prim.ListConstruct"(%1483, %10142) : (!torch.int, !torch.int) -> !torch.list<int>
    %10144 = "torch.aten.view"(%10141, %10143) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,2097152],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%10144, %1479) <{shape_expressions = #map2}> : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> ()
    %10145 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %10146 = "torch.aten.mul.Scalar"(%arg65, %10145) : (!torch.vtensor<[4,?],si64>, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%10146, %1478) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %10147 = "torch.constant.int"() <{value = 34 : i64}> : () -> !torch.int
    %10148 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %10149 = "torch.aten.add.Scalar"(%10146, %10147, %10148) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%10149, %1478) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %10150 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %10151 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %10152 = "torch.aten.add.Scalar"(%10149, %10150, %10151) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%10152, %1478) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %10153 = "torch.prim.ListConstruct"(%1924) : (!torch.int) -> !torch.list<int>
    %10154 = "torch.aten.view"(%10152, %10153) : (!torch.vtensor<[4,?],si64>, !torch.list<int>) -> !torch.vtensor<[?],si64>
    "torch.bind_symbolic_shape"(%10154, %1478) <{shape_expressions = #map24}> : (!torch.vtensor<[?],si64>, !torch.int) -> ()
    %10155 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %10156 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %10157 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %10158 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %10159 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %10160 = "torch.prim.ListConstruct"(%1483, %10155, %10156, %10157, %10158, %10159) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10161 = "torch.aten.view"(%10144, %10160) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%10161, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %10162 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %10163 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %10164 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %10165 = "torch.prim.ListConstruct"(%1914, %10162, %10163, %10164) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10166 = "torch.aten.view"(%10161, %10165) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%10166, %1479) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %10167 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %10168 = "torch.aten.index_select"(%10166, %10167, %10154) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int, !torch.vtensor<[?],si64>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%10168, %1478) <{shape_expressions = #map23}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %10169 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %10170 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %10171 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %10172 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %10173 = "torch.prim.ListConstruct"(%10169, %1481, %10170, %10171, %10172) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10174 = "torch.aten.view"(%10168, %10173) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%10174, %1478) <{shape_expressions = #map22}> : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %10175 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %10176 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %10177 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %10178 = "torch.prim.ListConstruct"(%10175, %1485, %10176, %10177) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10179 = "torch.aten.view"(%10174, %10178) : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%10179, %1478) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %10180 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %10181 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %10182 = "torch.aten.add.Scalar"(%10149, %10180, %10181) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%10182, %1478) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %10183 = "torch.prim.ListConstruct"(%1924) : (!torch.int) -> !torch.list<int>
    %10184 = "torch.aten.view"(%10182, %10183) : (!torch.vtensor<[4,?],si64>, !torch.list<int>) -> !torch.vtensor<[?],si64>
    "torch.bind_symbolic_shape"(%10184, %1478) <{shape_expressions = #map24}> : (!torch.vtensor<[?],si64>, !torch.int) -> ()
    %10185 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %10186 = "torch.aten.index_select"(%10166, %10185, %10184) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int, !torch.vtensor<[?],si64>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%10186, %1478) <{shape_expressions = #map23}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %10187 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %10188 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %10189 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %10190 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %10191 = "torch.prim.ListConstruct"(%10187, %1481, %10188, %10189, %10190) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10192 = "torch.aten.view"(%10186, %10191) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%10192, %1478) <{shape_expressions = #map22}> : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %10193 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %10194 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %10195 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %10196 = "torch.prim.ListConstruct"(%10193, %1485, %10194, %10195) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10197 = "torch.aten.view"(%10192, %10196) : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%10197, %1478) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %10198 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %10199 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %10200 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %10201 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %10202 = "torch.aten.slice.Tensor"(%10179, %10198, %10199, %10200, %10201) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%10202, %1478) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %10203 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %10204 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %10205 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %10206 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %10207 = "torch.aten.slice.Tensor"(%10197, %10203, %10204, %10205, %10206) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%10207, %1478) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %10208 = "torch.constant.int"() <{value = -2 : i64}> : () -> !torch.int
    %10209 = "torch.aten.unsqueeze"(%10202, %10208) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%10209, %1478) <{shape_expressions = #map25}> : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.int) -> ()
    %10210 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %10211 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %10212 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %10213 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %10214 = "torch.prim.ListConstruct"(%10210, %1485, %10211, %10212, %10213) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10215 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %10216 = "torch.aten.expand"(%10209, %10214, %10215) : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%10216, %1478) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %10217 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %10218 = "torch.aten.clone"(%10216, %10217) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%10218, %1478) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %10219 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %10220 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %10221 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %10222 = "torch.prim.ListConstruct"(%10219, %1485, %10220, %10221) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10223 = "torch.aten._unsafe_view"(%10218, %10222) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%10223, %1478) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %10224 = "torch.constant.int"() <{value = -2 : i64}> : () -> !torch.int
    %10225 = "torch.aten.unsqueeze"(%10207, %10224) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%10225, %1478) <{shape_expressions = #map25}> : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.int) -> ()
    %10226 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %10227 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %10228 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %10229 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %10230 = "torch.prim.ListConstruct"(%10226, %1485, %10227, %10228, %10229) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10231 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %10232 = "torch.aten.expand"(%10225, %10230, %10231) : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%10232, %1478) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %10233 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %10234 = "torch.aten.clone"(%10232, %10233) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%10234, %1478) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %10235 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %10236 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %10237 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %10238 = "torch.prim.ListConstruct"(%10235, %1485, %10236, %10237) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10239 = "torch.aten._unsafe_view"(%10234, %10238) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%10239, %1478) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %10240 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %10241 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %10242 = "torch.aten.transpose.int"(%10024, %10240, %10241) : (!torch.vtensor<[4,1,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %10243 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %10244 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %10245 = "torch.aten.transpose.int"(%10223, %10243, %10244) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%10245, %1478) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %10246 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %10247 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %10248 = "torch.aten.transpose.int"(%10239, %10246, %10247) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%10248, %1478) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %10249 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %10250 = "torch.aten.squeeze.dim"(%1516, %10249) : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,1,1,?],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%10250, %1478) <{shape_expressions = #map7}> : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>, !torch.int) -> ()
    %10251 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %10252 = "torch.aten.squeeze.dim"(%10250, %10251) : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,1,1,?],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%10252, %1478) <{shape_expressions = #map7}> : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>, !torch.int) -> ()
    %10253 = "torch_c.to_builtin_tensor"(%10242) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>) -> tensor<4x32x1x128xf8E4M3FNUZ>
    %10254 = "tensor.cast"(%10253) : (tensor<4x32x1x128xf8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %10255 = "torch_c.to_builtin_tensor"(%10245) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %10256 = "torch_c.to_builtin_tensor"(%10248) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %10257 = "torch_c.to_builtin_tensor"(%10252) : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>) -> tensor<4x1x1x?xf8E4M3FNUZ>
    %10258 = "tensor.cast"(%10257) : (tensor<4x1x1x?xf8E4M3FNUZ>) -> tensor<?x?x?x?xf8E4M3FNUZ>
    %10259 = "torch_c.to_builtin_tensor"(%866) : (!torch.vtensor<[],f32>) -> tensor<f32>
    %10260 = "util.call"(%10254, %10255, %10256, %10259, %10258) <{callee = @sharktank_masked_flash_attention_4_32_128_128_f8E4M3FNUZ_f32_f32}> : (tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<f32>, tensor<?x?x?x?xf8E4M3FNUZ>) -> tensor<4x32x?x128xf32>
    %10261 = "tensor.cast"(%10260) : (tensor<4x32x?x128xf32>) -> tensor<4x32x1x128xf32>
    %10262 = "torch_c.from_builtin_tensor"(%10261) : (tensor<4x32x1x128xf32>) -> !torch.vtensor<[4,32,1,128],f32>
    %10263 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %10264 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %10265 = "torch.aten.transpose.int"(%10262, %10263, %10264) : (!torch.vtensor<[4,32,1,128],f32>, !torch.int, !torch.int) -> !torch.vtensor<[4,1,32,128],f32>
    %10266 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %10267 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %10268 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %10269 = "torch.prim.ListConstruct"(%10266, %10267, %10268) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10270 = "torch.aten.view"(%10265, %10269) : (!torch.vtensor<[4,1,32,128],f32>, !torch.list<int>) -> !torch.vtensor<[4,1,4096],f32>
    %10271 = "torch.aten.div.Tensor"(%10270, %868) : (!torch.vtensor<[4,1,4096],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],f32>
    %10272 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %10273 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %10274 = "torch.aten.clamp"(%10271, %10272, %10273) : (!torch.vtensor<[4,1,4096],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],f32>
    %10275 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %10276 = "torch.prims.convert_element_type"(%10274, %10275) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %10277 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %10278 = "torch.aten.unsqueeze"(%870, %10277) : (!torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>
    %10279 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %10280 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %10281 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %10282 = "torch.prim.ListConstruct"(%10279, %10280, %10281) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10283 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %10284 = "torch.aten.expand"(%10278, %10282, %10283) : (!torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ>
    %10285 = "torch_c.to_builtin_tensor"(%10276) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %10286 = "torch_c.to_builtin_tensor"(%10284) : (!torch.vtensor<[4,4096,4096],f8E4M3FNUZ>) -> tensor<4x4096x4096xf8E4M3FNUZ>
    %10287 = "util.call"(%10285, %10286) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x1x4096xf32>
    %10288 = "torch_c.from_builtin_tensor"(%10287) : (tensor<4x1x4096xf32>) -> !torch.vtensor<[4,1,4096],f32>
    %10289 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %10290 = "torch.prims.convert_element_type"(%10288, %10289) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %10291 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %10292 = "torch.aten.add.Tensor"(%9891, %10290, %10291) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %10293 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %10294 = "torch.prims.convert_element_type"(%10292, %10293) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f32>
    %10295 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %10296 = "torch.aten.pow.Tensor_Scalar"(%10294, %10295) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],f32>
    %10297 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %10298 = "torch.prim.ListConstruct"(%10297) : (!torch.int) -> !torch.list<int>
    %10299 = "torch.constant.bool"() <{value = true}> : () -> !torch.bool
    %10300 = "torch.constant.none"() : () -> !torch.none
    %10301 = "torch.aten.mean.dim"(%10296, %10298, %10299, %10300) : (!torch.vtensor<[4,1,4096],f32>, !torch.list<int>, !torch.bool, !torch.none) -> !torch.vtensor<[4,1,1],f32>
    %10302 = "torch.constant.float"() <{value = 1.000000e-05 : f64}> : () -> !torch.float
    %10303 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %10304 = "torch.aten.add.Scalar"(%10301, %10302, %10303) : (!torch.vtensor<[4,1,1],f32>, !torch.float, !torch.int) -> !torch.vtensor<[4,1,1],f32>
    %10305 = "torch.aten.rsqrt"(%10304) : (!torch.vtensor<[4,1,1],f32>) -> !torch.vtensor<[4,1,1],f32>
    %10306 = "torch.aten.mul.Tensor"(%10294, %10305) : (!torch.vtensor<[4,1,4096],f32>, !torch.vtensor<[4,1,1],f32>) -> !torch.vtensor<[4,1,4096],f32>
    %10307 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %10308 = "torch.prims.convert_element_type"(%10306, %10307) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %10309 = "torch.aten.mul.Tensor"(%872, %10308) : (!torch.vtensor<[4096],bf16>, !torch.vtensor<[4,1,4096],bf16>) -> !torch.vtensor<[4,1,4096],bf16>
    %10310 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %10311 = "torch.prims.convert_element_type"(%10309, %10310) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %10312 = "torch.aten.div.Tensor"(%10311, %874) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %10313 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %10314 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %10315 = "torch.aten.clamp"(%10312, %10313, %10314) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %10316 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %10317 = "torch.prims.convert_element_type"(%10315, %10316) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %10318 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %10319 = "torch.aten.unsqueeze"(%876, %10318) : (!torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>
    %10320 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %10321 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %10322 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %10323 = "torch.prim.ListConstruct"(%10320, %10321, %10322) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10324 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %10325 = "torch.aten.expand"(%10319, %10323, %10324) : (!torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ>
    %10326 = "torch_c.to_builtin_tensor"(%10317) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %10327 = "torch_c.to_builtin_tensor"(%10325) : (!torch.vtensor<[4,14336,4096],f8E4M3FNUZ>) -> tensor<4x14336x4096xf8E4M3FNUZ>
    %10328 = "util.call"(%10326, %10327) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x1x14336xf32>
    %10329 = "torch_c.from_builtin_tensor"(%10328) : (tensor<4x1x14336xf32>) -> !torch.vtensor<[4,1,14336],f32>
    %10330 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %10331 = "torch.prims.convert_element_type"(%10329, %10330) : (!torch.vtensor<[4,1,14336],f32>, !torch.int) -> !torch.vtensor<[4,1,14336],bf16>
    %10332 = "torch.aten.silu"(%10331) : (!torch.vtensor<[4,1,14336],bf16>) -> !torch.vtensor<[4,1,14336],bf16>
    %10333 = "torch.aten.div.Tensor"(%10311, %878) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %10334 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %10335 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %10336 = "torch.aten.clamp"(%10333, %10334, %10335) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %10337 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %10338 = "torch.prims.convert_element_type"(%10336, %10337) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %10339 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %10340 = "torch.aten.unsqueeze"(%880, %10339) : (!torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>
    %10341 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %10342 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %10343 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %10344 = "torch.prim.ListConstruct"(%10341, %10342, %10343) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10345 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %10346 = "torch.aten.expand"(%10340, %10344, %10345) : (!torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ>
    %10347 = "torch_c.to_builtin_tensor"(%10338) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %10348 = "torch_c.to_builtin_tensor"(%10346) : (!torch.vtensor<[4,14336,4096],f8E4M3FNUZ>) -> tensor<4x14336x4096xf8E4M3FNUZ>
    %10349 = "util.call"(%10347, %10348) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x1x14336xf32>
    %10350 = "torch_c.from_builtin_tensor"(%10349) : (tensor<4x1x14336xf32>) -> !torch.vtensor<[4,1,14336],f32>
    %10351 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %10352 = "torch.prims.convert_element_type"(%10350, %10351) : (!torch.vtensor<[4,1,14336],f32>, !torch.int) -> !torch.vtensor<[4,1,14336],bf16>
    %10353 = "torch.aten.mul.Tensor"(%10332, %10352) : (!torch.vtensor<[4,1,14336],bf16>, !torch.vtensor<[4,1,14336],bf16>) -> !torch.vtensor<[4,1,14336],bf16>
    %10354 = "torch.aten.div.Tensor"(%10353, %882) : (!torch.vtensor<[4,1,14336],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,14336],bf16>
    %10355 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %10356 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %10357 = "torch.aten.clamp"(%10354, %10355, %10356) : (!torch.vtensor<[4,1,14336],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,14336],bf16>
    %10358 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %10359 = "torch.prims.convert_element_type"(%10357, %10358) : (!torch.vtensor<[4,1,14336],bf16>, !torch.int) -> !torch.vtensor<[4,1,14336],f8E4M3FNUZ>
    %10360 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %10361 = "torch.aten.unsqueeze"(%884, %10360) : (!torch.vtensor<[4096,14336],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,14336],f8E4M3FNUZ>
    %10362 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %10363 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %10364 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %10365 = "torch.prim.ListConstruct"(%10362, %10363, %10364) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10366 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %10367 = "torch.aten.expand"(%10361, %10365, %10366) : (!torch.vtensor<[1,4096,14336],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,14336],f8E4M3FNUZ>
    %10368 = "torch_c.to_builtin_tensor"(%10359) : (!torch.vtensor<[4,1,14336],f8E4M3FNUZ>) -> tensor<4x1x14336xf8E4M3FNUZ>
    %10369 = "torch_c.to_builtin_tensor"(%10367) : (!torch.vtensor<[4,4096,14336],f8E4M3FNUZ>) -> tensor<4x4096x14336xf8E4M3FNUZ>
    %10370 = "util.call"(%10368, %10369) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x14336xf8E4M3FNUZ_R4x4096x14336xf8E4M3FNUZ}> : (tensor<4x1x14336xf8E4M3FNUZ>, tensor<4x4096x14336xf8E4M3FNUZ>) -> tensor<4x1x4096xf32>
    %10371 = "torch_c.from_builtin_tensor"(%10370) : (tensor<4x1x4096xf32>) -> !torch.vtensor<[4,1,4096],f32>
    %10372 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %10373 = "torch.prims.convert_element_type"(%10371, %10372) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %10374 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %10375 = "torch.aten.add.Tensor"(%10292, %10373, %10374) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %10376 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %10377 = "torch.prims.convert_element_type"(%10375, %10376) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f32>
    %10378 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %10379 = "torch.aten.pow.Tensor_Scalar"(%10377, %10378) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],f32>
    %10380 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %10381 = "torch.prim.ListConstruct"(%10380) : (!torch.int) -> !torch.list<int>
    %10382 = "torch.constant.bool"() <{value = true}> : () -> !torch.bool
    %10383 = "torch.constant.none"() : () -> !torch.none
    %10384 = "torch.aten.mean.dim"(%10379, %10381, %10382, %10383) : (!torch.vtensor<[4,1,4096],f32>, !torch.list<int>, !torch.bool, !torch.none) -> !torch.vtensor<[4,1,1],f32>
    %10385 = "torch.constant.float"() <{value = 1.000000e-05 : f64}> : () -> !torch.float
    %10386 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %10387 = "torch.aten.add.Scalar"(%10384, %10385, %10386) : (!torch.vtensor<[4,1,1],f32>, !torch.float, !torch.int) -> !torch.vtensor<[4,1,1],f32>
    %10388 = "torch.aten.rsqrt"(%10387) : (!torch.vtensor<[4,1,1],f32>) -> !torch.vtensor<[4,1,1],f32>
    %10389 = "torch.aten.mul.Tensor"(%10377, %10388) : (!torch.vtensor<[4,1,4096],f32>, !torch.vtensor<[4,1,1],f32>) -> !torch.vtensor<[4,1,4096],f32>
    %10390 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %10391 = "torch.prims.convert_element_type"(%10389, %10390) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %10392 = "torch.aten.mul.Tensor"(%886, %10391) : (!torch.vtensor<[4096],bf16>, !torch.vtensor<[4,1,4096],bf16>) -> !torch.vtensor<[4,1,4096],bf16>
    %10393 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %10394 = "torch.prims.convert_element_type"(%10392, %10393) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %10395 = "torch.aten.div.Tensor"(%10394, %888) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %10396 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %10397 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %10398 = "torch.aten.clamp"(%10395, %10396, %10397) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %10399 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %10400 = "torch.prims.convert_element_type"(%10398, %10399) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %10401 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %10402 = "torch.aten.unsqueeze"(%890, %10401) : (!torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>
    %10403 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %10404 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %10405 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %10406 = "torch.prim.ListConstruct"(%10403, %10404, %10405) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10407 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %10408 = "torch.aten.expand"(%10402, %10406, %10407) : (!torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ>
    %10409 = "torch_c.to_builtin_tensor"(%10400) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %10410 = "torch_c.to_builtin_tensor"(%10408) : (!torch.vtensor<[4,4096,4096],f8E4M3FNUZ>) -> tensor<4x4096x4096xf8E4M3FNUZ>
    %10411 = "util.call"(%10409, %10410) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x1x4096xf32>
    %10412 = "torch_c.from_builtin_tensor"(%10411) : (tensor<4x1x4096xf32>) -> !torch.vtensor<[4,1,4096],f32>
    %10413 = "torch.aten.div.Tensor"(%10412, %892) : (!torch.vtensor<[4,1,4096],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],f32>
    %10414 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %10415 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %10416 = "torch.aten.clamp"(%10413, %10414, %10415) : (!torch.vtensor<[4,1,4096],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],f32>
    %10417 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %10418 = "torch.prims.convert_element_type"(%10416, %10417) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %10419 = "torch.aten.div.Tensor"(%10394, %894) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %10420 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %10421 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %10422 = "torch.aten.clamp"(%10419, %10420, %10421) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %10423 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %10424 = "torch.prims.convert_element_type"(%10422, %10423) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %10425 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %10426 = "torch.aten.unsqueeze"(%896, %10425) : (!torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>
    %10427 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %10428 = "torch.constant.int"() <{value = 1024 : i64}> : () -> !torch.int
    %10429 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %10430 = "torch.prim.ListConstruct"(%10427, %10428, %10429) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10431 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %10432 = "torch.aten.expand"(%10426, %10430, %10431) : (!torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ>
    %10433 = "torch_c.to_builtin_tensor"(%10424) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %10434 = "torch_c.to_builtin_tensor"(%10432) : (!torch.vtensor<[4,1024,4096],f8E4M3FNUZ>) -> tensor<4x1024x4096xf8E4M3FNUZ>
    %10435 = "util.call"(%10433, %10434) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x1x1024xf32>
    %10436 = "torch_c.from_builtin_tensor"(%10435) : (tensor<4x1x1024xf32>) -> !torch.vtensor<[4,1,1024],f32>
    %10437 = "torch.aten.div.Tensor"(%10436, %898) : (!torch.vtensor<[4,1,1024],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,1024],f32>
    %10438 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %10439 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %10440 = "torch.aten.clamp"(%10437, %10438, %10439) : (!torch.vtensor<[4,1,1024],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,1024],f32>
    %10441 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %10442 = "torch.prims.convert_element_type"(%10440, %10441) : (!torch.vtensor<[4,1,1024],f32>, !torch.int) -> !torch.vtensor<[4,1,1024],f8E4M3FNUZ>
    %10443 = "torch.aten.div.Tensor"(%10394, %900) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %10444 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %10445 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %10446 = "torch.aten.clamp"(%10443, %10444, %10445) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %10447 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %10448 = "torch.prims.convert_element_type"(%10446, %10447) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %10449 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %10450 = "torch.aten.unsqueeze"(%902, %10449) : (!torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>
    %10451 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %10452 = "torch.constant.int"() <{value = 1024 : i64}> : () -> !torch.int
    %10453 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %10454 = "torch.prim.ListConstruct"(%10451, %10452, %10453) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10455 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %10456 = "torch.aten.expand"(%10450, %10454, %10455) : (!torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ>
    %10457 = "torch_c.to_builtin_tensor"(%10448) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %10458 = "torch_c.to_builtin_tensor"(%10456) : (!torch.vtensor<[4,1024,4096],f8E4M3FNUZ>) -> tensor<4x1024x4096xf8E4M3FNUZ>
    %10459 = "util.call"(%10457, %10458) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x1x1024xf32>
    %10460 = "torch_c.from_builtin_tensor"(%10459) : (tensor<4x1x1024xf32>) -> !torch.vtensor<[4,1,1024],f32>
    %10461 = "torch.aten.div.Tensor"(%10460, %904) : (!torch.vtensor<[4,1,1024],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,1024],f32>
    %10462 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %10463 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %10464 = "torch.aten.clamp"(%10461, %10462, %10463) : (!torch.vtensor<[4,1,1024],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,1024],f32>
    %10465 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %10466 = "torch.prims.convert_element_type"(%10464, %10465) : (!torch.vtensor<[4,1,1024],f32>, !torch.int) -> !torch.vtensor<[4,1,1024],f8E4M3FNUZ>
    %10467 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %10468 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %10469 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %10470 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %10471 = "torch.prim.ListConstruct"(%10467, %10468, %10469, %10470) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10472 = "torch.aten.view"(%10418, %10471) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,1,32,128],f8E4M3FNUZ>
    %10473 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %10474 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %10475 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %10476 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %10477 = "torch.prim.ListConstruct"(%10473, %10474, %10475, %10476) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10478 = "torch.aten.view"(%10442, %10477) : (!torch.vtensor<[4,1,1024],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>
    %10479 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %10480 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %10481 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %10482 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %10483 = "torch.prim.ListConstruct"(%10479, %10480, %10481, %10482) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10484 = "torch.aten.view"(%10466, %10483) : (!torch.vtensor<[4,1,1024],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>
    %10485 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %10486 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %10487 = "torch.aten.transpose.int"(%10472, %10485, %10486) : (!torch.vtensor<[4,1,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %10488 = "torch.aten.mul.Tensor"(%10487, %1637) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,1,1,128],bf16>) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %10489 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %10490 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %10491 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %10492 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %10493 = "torch.aten.slice.Tensor"(%10487, %10489, %10490, %10491, %10492) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,32,1,64],f8E4M3FNUZ>
    %10494 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %10495 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %10496 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %10497 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %10498 = "torch.aten.slice.Tensor"(%10487, %10494, %10495, %10496, %10497) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,32,1,64],f8E4M3FNUZ>
    %10499 = "torch.aten.neg"(%10498) : (!torch.vtensor<[4,32,1,64],f8E4M3FNUZ>) -> !torch.vtensor<[4,32,1,64],f8E4M3FNUZ>
    %10500 = "torch.prim.ListConstruct"(%10499, %10493) : (!torch.vtensor<[4,32,1,64],f8E4M3FNUZ>, !torch.vtensor<[4,32,1,64],f8E4M3FNUZ>) -> !torch.list<vtensor>
    %10501 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %10502 = "torch.aten.cat"(%10500, %10501) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %10503 = "torch.aten.mul.Tensor"(%10502, %1651) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,1,1,128],bf16>) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %10504 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %10505 = "torch.aten.add.Tensor"(%10488, %10503, %10504) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %10506 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %10507 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %10508 = "torch.aten.transpose.int"(%10505, %10506, %10507) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,1,32,128],f8E4M3FNUZ>
    %10509 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %10510 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %10511 = "torch.aten.transpose.int"(%10478, %10509, %10510) : (!torch.vtensor<[4,1,8,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %10512 = "torch.aten.mul.Tensor"(%10511, %1637) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,1,1,128],bf16>) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %10513 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %10514 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %10515 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %10516 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %10517 = "torch.aten.slice.Tensor"(%10511, %10513, %10514, %10515, %10516) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,8,1,64],f8E4M3FNUZ>
    %10518 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %10519 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %10520 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %10521 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %10522 = "torch.aten.slice.Tensor"(%10511, %10518, %10519, %10520, %10521) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,8,1,64],f8E4M3FNUZ>
    %10523 = "torch.aten.neg"(%10522) : (!torch.vtensor<[4,8,1,64],f8E4M3FNUZ>) -> !torch.vtensor<[4,8,1,64],f8E4M3FNUZ>
    %10524 = "torch.prim.ListConstruct"(%10523, %10517) : (!torch.vtensor<[4,8,1,64],f8E4M3FNUZ>, !torch.vtensor<[4,8,1,64],f8E4M3FNUZ>) -> !torch.list<vtensor>
    %10525 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %10526 = "torch.aten.cat"(%10524, %10525) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %10527 = "torch.aten.mul.Tensor"(%10526, %1651) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,1,1,128],bf16>) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %10528 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %10529 = "torch.aten.add.Tensor"(%10512, %10527, %10528) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %10530 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %10531 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %10532 = "torch.aten.transpose.int"(%10529, %10530, %10531) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>
    %10533 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %10534 = "torch.aten.floor_divide.Scalar"(%arg64, %10533) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4],si64>
    %10535 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %10536 = "torch.aten.unsqueeze"(%10534, %10535) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %10537 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %10538 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %10539 = "torch.aten.gather"(%arg65, %10537, %10536, %10538) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.vtensor<[4,1],si64>, !torch.bool) -> !torch.vtensor<[4,1],si64>
    %10540 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %10541 = "torch.aten.remainder.Scalar"(%arg64, %10540) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4],si64>
    %10542 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %10543 = "torch.aten.unsqueeze"(%10541, %10542) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %10544 = "torch.constant.none"() : () -> !torch.none
    %10545 = "torch.aten.clone"(%905, %10544) : (!torch.vtensor<[],si64>, !torch.none) -> !torch.vtensor<[],si64>
    %10546 = "torch.aten.detach"(%10545) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %10547 = "torch.aten.detach"(%10546) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %10548 = "torch.aten.detach"(%10547) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %10549 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %10550 = "torch.aten.unsqueeze"(%10548, %10549) : (!torch.vtensor<[],si64>, !torch.int) -> !torch.vtensor<[1],si64>
    %10551 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %10552 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %10553 = "torch.prim.ListConstruct"(%10551, %10552) : (!torch.int, !torch.int) -> !torch.list<int>
    %10554 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %10555 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %10556 = "torch.prim.ListConstruct"(%10554, %10555) : (!torch.int, !torch.int) -> !torch.list<int>
    %10557 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %10558 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %10559 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %10560 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %10561 = "torch.aten.empty_strided"(%10553, %10556, %10557, %10558, %10559, %10560) : (!torch.list<int>, !torch.list<int>, !torch.int, !torch.int, !torch.Device, !torch.bool) -> !torch.vtensor<[4,1],si64>
    %10562 = "torch.constant.int"() <{value = 18 : i64}> : () -> !torch.int
    %10563 = "torch.aten.fill.Scalar"(%10561, %10562) : (!torch.vtensor<[4,1],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %10564 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %10565 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %10566 = "torch.prim.ListConstruct"(%10564, %10565) : (!torch.int, !torch.int) -> !torch.list<int>
    %10567 = "torch.aten.repeat"(%10550, %10566) : (!torch.vtensor<[1],si64>, !torch.list<int>) -> !torch.vtensor<[4,1],si64>
    %10568 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %10569 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %10570 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %10571 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %10572 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %10573 = "torch.prim.ListConstruct"(%1483, %10568, %10569, %10570, %10571, %10572) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10574 = "torch.aten.view"(%10144, %10573) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%10574, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %10575 = "torch.prim.ListConstruct"(%10539, %10563, %10567, %10543) : (!torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>) -> !torch.list<optional<vtensor>>
    %10576 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %10577 = "torch.aten.index_put"(%10574, %10575, %10532, %10576) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>, !torch.bool) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%10577, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %10578 = "torch.constant.int"() <{value = 2097152 : i64}> : () -> !torch.int
    %10579 = "torch.prim.ListConstruct"(%1483, %10578) : (!torch.int, !torch.int) -> !torch.list<int>
    %10580 = "torch.aten.view"(%10577, %10579) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,2097152],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%10580, %1479) <{shape_expressions = #map2}> : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> ()
    %10581 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %10582 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %10583 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %10584 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %10585 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %10586 = "torch.prim.ListConstruct"(%1483, %10581, %10582, %10583, %10584, %10585) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10587 = "torch.aten.view"(%10580, %10586) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%10587, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %10588 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %10589 = "torch.aten.floor_divide.Scalar"(%arg64, %10588) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4],si64>
    %10590 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %10591 = "torch.aten.unsqueeze"(%10589, %10590) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %10592 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %10593 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %10594 = "torch.aten.gather"(%arg65, %10592, %10591, %10593) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.vtensor<[4,1],si64>, !torch.bool) -> !torch.vtensor<[4,1],si64>
    %10595 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %10596 = "torch.aten.remainder.Scalar"(%arg64, %10595) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4],si64>
    %10597 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %10598 = "torch.aten.unsqueeze"(%10596, %10597) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %10599 = "torch.constant.none"() : () -> !torch.none
    %10600 = "torch.aten.clone"(%906, %10599) : (!torch.vtensor<[],si64>, !torch.none) -> !torch.vtensor<[],si64>
    %10601 = "torch.aten.detach"(%10600) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %10602 = "torch.aten.detach"(%10601) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %10603 = "torch.aten.detach"(%10602) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %10604 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %10605 = "torch.aten.unsqueeze"(%10603, %10604) : (!torch.vtensor<[],si64>, !torch.int) -> !torch.vtensor<[1],si64>
    %10606 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %10607 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %10608 = "torch.prim.ListConstruct"(%10606, %10607) : (!torch.int, !torch.int) -> !torch.list<int>
    %10609 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %10610 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %10611 = "torch.prim.ListConstruct"(%10609, %10610) : (!torch.int, !torch.int) -> !torch.list<int>
    %10612 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %10613 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %10614 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %10615 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %10616 = "torch.aten.empty_strided"(%10608, %10611, %10612, %10613, %10614, %10615) : (!torch.list<int>, !torch.list<int>, !torch.int, !torch.int, !torch.Device, !torch.bool) -> !torch.vtensor<[4,1],si64>
    %10617 = "torch.constant.int"() <{value = 18 : i64}> : () -> !torch.int
    %10618 = "torch.aten.fill.Scalar"(%10616, %10617) : (!torch.vtensor<[4,1],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %10619 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %10620 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %10621 = "torch.prim.ListConstruct"(%10619, %10620) : (!torch.int, !torch.int) -> !torch.list<int>
    %10622 = "torch.aten.repeat"(%10605, %10621) : (!torch.vtensor<[1],si64>, !torch.list<int>) -> !torch.vtensor<[4,1],si64>
    %10623 = "torch.prim.ListConstruct"(%10594, %10618, %10622, %10598) : (!torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>) -> !torch.list<optional<vtensor>>
    %10624 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %10625 = "torch.aten.index_put"(%10587, %10623, %10484, %10624) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>, !torch.bool) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%10625, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %10626 = "torch.constant.int"() <{value = 2097152 : i64}> : () -> !torch.int
    %10627 = "torch.prim.ListConstruct"(%1483, %10626) : (!torch.int, !torch.int) -> !torch.list<int>
    %10628 = "torch.aten.view"(%10625, %10627) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,2097152],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%10628, %1479) <{shape_expressions = #map2}> : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> ()
    %10629 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %10630 = "torch.aten.mul.Scalar"(%arg65, %10629) : (!torch.vtensor<[4,?],si64>, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%10630, %1478) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %10631 = "torch.constant.int"() <{value = 36 : i64}> : () -> !torch.int
    %10632 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %10633 = "torch.aten.add.Scalar"(%10630, %10631, %10632) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%10633, %1478) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %10634 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %10635 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %10636 = "torch.aten.add.Scalar"(%10633, %10634, %10635) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%10636, %1478) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %10637 = "torch.prim.ListConstruct"(%1924) : (!torch.int) -> !torch.list<int>
    %10638 = "torch.aten.view"(%10636, %10637) : (!torch.vtensor<[4,?],si64>, !torch.list<int>) -> !torch.vtensor<[?],si64>
    "torch.bind_symbolic_shape"(%10638, %1478) <{shape_expressions = #map24}> : (!torch.vtensor<[?],si64>, !torch.int) -> ()
    %10639 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %10640 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %10641 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %10642 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %10643 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %10644 = "torch.prim.ListConstruct"(%1483, %10639, %10640, %10641, %10642, %10643) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10645 = "torch.aten.view"(%10628, %10644) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%10645, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %10646 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %10647 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %10648 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %10649 = "torch.prim.ListConstruct"(%1914, %10646, %10647, %10648) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10650 = "torch.aten.view"(%10645, %10649) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%10650, %1479) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %10651 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %10652 = "torch.aten.index_select"(%10650, %10651, %10638) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int, !torch.vtensor<[?],si64>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%10652, %1478) <{shape_expressions = #map23}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %10653 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %10654 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %10655 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %10656 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %10657 = "torch.prim.ListConstruct"(%10653, %1481, %10654, %10655, %10656) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10658 = "torch.aten.view"(%10652, %10657) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%10658, %1478) <{shape_expressions = #map22}> : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %10659 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %10660 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %10661 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %10662 = "torch.prim.ListConstruct"(%10659, %1485, %10660, %10661) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10663 = "torch.aten.view"(%10658, %10662) : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%10663, %1478) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %10664 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %10665 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %10666 = "torch.aten.add.Scalar"(%10633, %10664, %10665) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%10666, %1478) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %10667 = "torch.prim.ListConstruct"(%1924) : (!torch.int) -> !torch.list<int>
    %10668 = "torch.aten.view"(%10666, %10667) : (!torch.vtensor<[4,?],si64>, !torch.list<int>) -> !torch.vtensor<[?],si64>
    "torch.bind_symbolic_shape"(%10668, %1478) <{shape_expressions = #map24}> : (!torch.vtensor<[?],si64>, !torch.int) -> ()
    %10669 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %10670 = "torch.aten.index_select"(%10650, %10669, %10668) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int, !torch.vtensor<[?],si64>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%10670, %1478) <{shape_expressions = #map23}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %10671 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %10672 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %10673 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %10674 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %10675 = "torch.prim.ListConstruct"(%10671, %1481, %10672, %10673, %10674) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10676 = "torch.aten.view"(%10670, %10675) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%10676, %1478) <{shape_expressions = #map22}> : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %10677 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %10678 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %10679 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %10680 = "torch.prim.ListConstruct"(%10677, %1485, %10678, %10679) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10681 = "torch.aten.view"(%10676, %10680) : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%10681, %1478) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %10682 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %10683 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %10684 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %10685 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %10686 = "torch.aten.slice.Tensor"(%10663, %10682, %10683, %10684, %10685) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%10686, %1478) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %10687 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %10688 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %10689 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %10690 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %10691 = "torch.aten.slice.Tensor"(%10681, %10687, %10688, %10689, %10690) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%10691, %1478) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %10692 = "torch.constant.int"() <{value = -2 : i64}> : () -> !torch.int
    %10693 = "torch.aten.unsqueeze"(%10686, %10692) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%10693, %1478) <{shape_expressions = #map25}> : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.int) -> ()
    %10694 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %10695 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %10696 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %10697 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %10698 = "torch.prim.ListConstruct"(%10694, %1485, %10695, %10696, %10697) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10699 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %10700 = "torch.aten.expand"(%10693, %10698, %10699) : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%10700, %1478) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %10701 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %10702 = "torch.aten.clone"(%10700, %10701) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%10702, %1478) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %10703 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %10704 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %10705 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %10706 = "torch.prim.ListConstruct"(%10703, %1485, %10704, %10705) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10707 = "torch.aten._unsafe_view"(%10702, %10706) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%10707, %1478) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %10708 = "torch.constant.int"() <{value = -2 : i64}> : () -> !torch.int
    %10709 = "torch.aten.unsqueeze"(%10691, %10708) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%10709, %1478) <{shape_expressions = #map25}> : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.int) -> ()
    %10710 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %10711 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %10712 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %10713 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %10714 = "torch.prim.ListConstruct"(%10710, %1485, %10711, %10712, %10713) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10715 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %10716 = "torch.aten.expand"(%10709, %10714, %10715) : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%10716, %1478) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %10717 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %10718 = "torch.aten.clone"(%10716, %10717) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%10718, %1478) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %10719 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %10720 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %10721 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %10722 = "torch.prim.ListConstruct"(%10719, %1485, %10720, %10721) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10723 = "torch.aten._unsafe_view"(%10718, %10722) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%10723, %1478) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %10724 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %10725 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %10726 = "torch.aten.transpose.int"(%10508, %10724, %10725) : (!torch.vtensor<[4,1,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %10727 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %10728 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %10729 = "torch.aten.transpose.int"(%10707, %10727, %10728) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%10729, %1478) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %10730 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %10731 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %10732 = "torch.aten.transpose.int"(%10723, %10730, %10731) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%10732, %1478) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %10733 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %10734 = "torch.aten.squeeze.dim"(%1516, %10733) : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,1,1,?],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%10734, %1478) <{shape_expressions = #map7}> : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>, !torch.int) -> ()
    %10735 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %10736 = "torch.aten.squeeze.dim"(%10734, %10735) : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,1,1,?],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%10736, %1478) <{shape_expressions = #map7}> : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>, !torch.int) -> ()
    %10737 = "torch_c.to_builtin_tensor"(%10726) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>) -> tensor<4x32x1x128xf8E4M3FNUZ>
    %10738 = "tensor.cast"(%10737) : (tensor<4x32x1x128xf8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %10739 = "torch_c.to_builtin_tensor"(%10729) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %10740 = "torch_c.to_builtin_tensor"(%10732) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %10741 = "torch_c.to_builtin_tensor"(%10736) : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>) -> tensor<4x1x1x?xf8E4M3FNUZ>
    %10742 = "tensor.cast"(%10741) : (tensor<4x1x1x?xf8E4M3FNUZ>) -> tensor<?x?x?x?xf8E4M3FNUZ>
    %10743 = "torch_c.to_builtin_tensor"(%908) : (!torch.vtensor<[],f32>) -> tensor<f32>
    %10744 = "util.call"(%10738, %10739, %10740, %10743, %10742) <{callee = @sharktank_masked_flash_attention_4_32_128_128_f8E4M3FNUZ_f32_f32}> : (tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<f32>, tensor<?x?x?x?xf8E4M3FNUZ>) -> tensor<4x32x?x128xf32>
    %10745 = "tensor.cast"(%10744) : (tensor<4x32x?x128xf32>) -> tensor<4x32x1x128xf32>
    %10746 = "torch_c.from_builtin_tensor"(%10745) : (tensor<4x32x1x128xf32>) -> !torch.vtensor<[4,32,1,128],f32>
    %10747 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %10748 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %10749 = "torch.aten.transpose.int"(%10746, %10747, %10748) : (!torch.vtensor<[4,32,1,128],f32>, !torch.int, !torch.int) -> !torch.vtensor<[4,1,32,128],f32>
    %10750 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %10751 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %10752 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %10753 = "torch.prim.ListConstruct"(%10750, %10751, %10752) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10754 = "torch.aten.view"(%10749, %10753) : (!torch.vtensor<[4,1,32,128],f32>, !torch.list<int>) -> !torch.vtensor<[4,1,4096],f32>
    %10755 = "torch.aten.div.Tensor"(%10754, %910) : (!torch.vtensor<[4,1,4096],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],f32>
    %10756 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %10757 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %10758 = "torch.aten.clamp"(%10755, %10756, %10757) : (!torch.vtensor<[4,1,4096],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],f32>
    %10759 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %10760 = "torch.prims.convert_element_type"(%10758, %10759) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %10761 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %10762 = "torch.aten.unsqueeze"(%912, %10761) : (!torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>
    %10763 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %10764 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %10765 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %10766 = "torch.prim.ListConstruct"(%10763, %10764, %10765) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10767 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %10768 = "torch.aten.expand"(%10762, %10766, %10767) : (!torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ>
    %10769 = "torch_c.to_builtin_tensor"(%10760) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %10770 = "torch_c.to_builtin_tensor"(%10768) : (!torch.vtensor<[4,4096,4096],f8E4M3FNUZ>) -> tensor<4x4096x4096xf8E4M3FNUZ>
    %10771 = "util.call"(%10769, %10770) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x1x4096xf32>
    %10772 = "torch_c.from_builtin_tensor"(%10771) : (tensor<4x1x4096xf32>) -> !torch.vtensor<[4,1,4096],f32>
    %10773 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %10774 = "torch.prims.convert_element_type"(%10772, %10773) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %10775 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %10776 = "torch.aten.add.Tensor"(%10375, %10774, %10775) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %10777 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %10778 = "torch.prims.convert_element_type"(%10776, %10777) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f32>
    %10779 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %10780 = "torch.aten.pow.Tensor_Scalar"(%10778, %10779) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],f32>
    %10781 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %10782 = "torch.prim.ListConstruct"(%10781) : (!torch.int) -> !torch.list<int>
    %10783 = "torch.constant.bool"() <{value = true}> : () -> !torch.bool
    %10784 = "torch.constant.none"() : () -> !torch.none
    %10785 = "torch.aten.mean.dim"(%10780, %10782, %10783, %10784) : (!torch.vtensor<[4,1,4096],f32>, !torch.list<int>, !torch.bool, !torch.none) -> !torch.vtensor<[4,1,1],f32>
    %10786 = "torch.constant.float"() <{value = 1.000000e-05 : f64}> : () -> !torch.float
    %10787 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %10788 = "torch.aten.add.Scalar"(%10785, %10786, %10787) : (!torch.vtensor<[4,1,1],f32>, !torch.float, !torch.int) -> !torch.vtensor<[4,1,1],f32>
    %10789 = "torch.aten.rsqrt"(%10788) : (!torch.vtensor<[4,1,1],f32>) -> !torch.vtensor<[4,1,1],f32>
    %10790 = "torch.aten.mul.Tensor"(%10778, %10789) : (!torch.vtensor<[4,1,4096],f32>, !torch.vtensor<[4,1,1],f32>) -> !torch.vtensor<[4,1,4096],f32>
    %10791 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %10792 = "torch.prims.convert_element_type"(%10790, %10791) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %10793 = "torch.aten.mul.Tensor"(%914, %10792) : (!torch.vtensor<[4096],bf16>, !torch.vtensor<[4,1,4096],bf16>) -> !torch.vtensor<[4,1,4096],bf16>
    %10794 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %10795 = "torch.prims.convert_element_type"(%10793, %10794) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %10796 = "torch.aten.div.Tensor"(%10795, %916) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %10797 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %10798 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %10799 = "torch.aten.clamp"(%10796, %10797, %10798) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %10800 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %10801 = "torch.prims.convert_element_type"(%10799, %10800) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %10802 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %10803 = "torch.aten.unsqueeze"(%918, %10802) : (!torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>
    %10804 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %10805 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %10806 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %10807 = "torch.prim.ListConstruct"(%10804, %10805, %10806) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10808 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %10809 = "torch.aten.expand"(%10803, %10807, %10808) : (!torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ>
    %10810 = "torch_c.to_builtin_tensor"(%10801) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %10811 = "torch_c.to_builtin_tensor"(%10809) : (!torch.vtensor<[4,14336,4096],f8E4M3FNUZ>) -> tensor<4x14336x4096xf8E4M3FNUZ>
    %10812 = "util.call"(%10810, %10811) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x1x14336xf32>
    %10813 = "torch_c.from_builtin_tensor"(%10812) : (tensor<4x1x14336xf32>) -> !torch.vtensor<[4,1,14336],f32>
    %10814 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %10815 = "torch.prims.convert_element_type"(%10813, %10814) : (!torch.vtensor<[4,1,14336],f32>, !torch.int) -> !torch.vtensor<[4,1,14336],bf16>
    %10816 = "torch.aten.silu"(%10815) : (!torch.vtensor<[4,1,14336],bf16>) -> !torch.vtensor<[4,1,14336],bf16>
    %10817 = "torch.aten.div.Tensor"(%10795, %920) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %10818 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %10819 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %10820 = "torch.aten.clamp"(%10817, %10818, %10819) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %10821 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %10822 = "torch.prims.convert_element_type"(%10820, %10821) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %10823 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %10824 = "torch.aten.unsqueeze"(%922, %10823) : (!torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>
    %10825 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %10826 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %10827 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %10828 = "torch.prim.ListConstruct"(%10825, %10826, %10827) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10829 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %10830 = "torch.aten.expand"(%10824, %10828, %10829) : (!torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ>
    %10831 = "torch_c.to_builtin_tensor"(%10822) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %10832 = "torch_c.to_builtin_tensor"(%10830) : (!torch.vtensor<[4,14336,4096],f8E4M3FNUZ>) -> tensor<4x14336x4096xf8E4M3FNUZ>
    %10833 = "util.call"(%10831, %10832) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x1x14336xf32>
    %10834 = "torch_c.from_builtin_tensor"(%10833) : (tensor<4x1x14336xf32>) -> !torch.vtensor<[4,1,14336],f32>
    %10835 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %10836 = "torch.prims.convert_element_type"(%10834, %10835) : (!torch.vtensor<[4,1,14336],f32>, !torch.int) -> !torch.vtensor<[4,1,14336],bf16>
    %10837 = "torch.aten.mul.Tensor"(%10816, %10836) : (!torch.vtensor<[4,1,14336],bf16>, !torch.vtensor<[4,1,14336],bf16>) -> !torch.vtensor<[4,1,14336],bf16>
    %10838 = "torch.aten.div.Tensor"(%10837, %924) : (!torch.vtensor<[4,1,14336],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,14336],bf16>
    %10839 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %10840 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %10841 = "torch.aten.clamp"(%10838, %10839, %10840) : (!torch.vtensor<[4,1,14336],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,14336],bf16>
    %10842 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %10843 = "torch.prims.convert_element_type"(%10841, %10842) : (!torch.vtensor<[4,1,14336],bf16>, !torch.int) -> !torch.vtensor<[4,1,14336],f8E4M3FNUZ>
    %10844 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %10845 = "torch.aten.unsqueeze"(%926, %10844) : (!torch.vtensor<[4096,14336],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,14336],f8E4M3FNUZ>
    %10846 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %10847 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %10848 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %10849 = "torch.prim.ListConstruct"(%10846, %10847, %10848) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10850 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %10851 = "torch.aten.expand"(%10845, %10849, %10850) : (!torch.vtensor<[1,4096,14336],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,14336],f8E4M3FNUZ>
    %10852 = "torch_c.to_builtin_tensor"(%10843) : (!torch.vtensor<[4,1,14336],f8E4M3FNUZ>) -> tensor<4x1x14336xf8E4M3FNUZ>
    %10853 = "torch_c.to_builtin_tensor"(%10851) : (!torch.vtensor<[4,4096,14336],f8E4M3FNUZ>) -> tensor<4x4096x14336xf8E4M3FNUZ>
    %10854 = "util.call"(%10852, %10853) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x14336xf8E4M3FNUZ_R4x4096x14336xf8E4M3FNUZ}> : (tensor<4x1x14336xf8E4M3FNUZ>, tensor<4x4096x14336xf8E4M3FNUZ>) -> tensor<4x1x4096xf32>
    %10855 = "torch_c.from_builtin_tensor"(%10854) : (tensor<4x1x4096xf32>) -> !torch.vtensor<[4,1,4096],f32>
    %10856 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %10857 = "torch.prims.convert_element_type"(%10855, %10856) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %10858 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %10859 = "torch.aten.add.Tensor"(%10776, %10857, %10858) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %10860 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %10861 = "torch.prims.convert_element_type"(%10859, %10860) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f32>
    %10862 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %10863 = "torch.aten.pow.Tensor_Scalar"(%10861, %10862) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],f32>
    %10864 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %10865 = "torch.prim.ListConstruct"(%10864) : (!torch.int) -> !torch.list<int>
    %10866 = "torch.constant.bool"() <{value = true}> : () -> !torch.bool
    %10867 = "torch.constant.none"() : () -> !torch.none
    %10868 = "torch.aten.mean.dim"(%10863, %10865, %10866, %10867) : (!torch.vtensor<[4,1,4096],f32>, !torch.list<int>, !torch.bool, !torch.none) -> !torch.vtensor<[4,1,1],f32>
    %10869 = "torch.constant.float"() <{value = 1.000000e-05 : f64}> : () -> !torch.float
    %10870 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %10871 = "torch.aten.add.Scalar"(%10868, %10869, %10870) : (!torch.vtensor<[4,1,1],f32>, !torch.float, !torch.int) -> !torch.vtensor<[4,1,1],f32>
    %10872 = "torch.aten.rsqrt"(%10871) : (!torch.vtensor<[4,1,1],f32>) -> !torch.vtensor<[4,1,1],f32>
    %10873 = "torch.aten.mul.Tensor"(%10861, %10872) : (!torch.vtensor<[4,1,4096],f32>, !torch.vtensor<[4,1,1],f32>) -> !torch.vtensor<[4,1,4096],f32>
    %10874 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %10875 = "torch.prims.convert_element_type"(%10873, %10874) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %10876 = "torch.aten.mul.Tensor"(%928, %10875) : (!torch.vtensor<[4096],bf16>, !torch.vtensor<[4,1,4096],bf16>) -> !torch.vtensor<[4,1,4096],bf16>
    %10877 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %10878 = "torch.prims.convert_element_type"(%10876, %10877) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %10879 = "torch.aten.div.Tensor"(%10878, %930) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %10880 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %10881 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %10882 = "torch.aten.clamp"(%10879, %10880, %10881) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %10883 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %10884 = "torch.prims.convert_element_type"(%10882, %10883) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %10885 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %10886 = "torch.aten.unsqueeze"(%932, %10885) : (!torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>
    %10887 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %10888 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %10889 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %10890 = "torch.prim.ListConstruct"(%10887, %10888, %10889) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10891 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %10892 = "torch.aten.expand"(%10886, %10890, %10891) : (!torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ>
    %10893 = "torch_c.to_builtin_tensor"(%10884) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %10894 = "torch_c.to_builtin_tensor"(%10892) : (!torch.vtensor<[4,4096,4096],f8E4M3FNUZ>) -> tensor<4x4096x4096xf8E4M3FNUZ>
    %10895 = "util.call"(%10893, %10894) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x1x4096xf32>
    %10896 = "torch_c.from_builtin_tensor"(%10895) : (tensor<4x1x4096xf32>) -> !torch.vtensor<[4,1,4096],f32>
    %10897 = "torch.aten.div.Tensor"(%10896, %934) : (!torch.vtensor<[4,1,4096],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],f32>
    %10898 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %10899 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %10900 = "torch.aten.clamp"(%10897, %10898, %10899) : (!torch.vtensor<[4,1,4096],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],f32>
    %10901 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %10902 = "torch.prims.convert_element_type"(%10900, %10901) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %10903 = "torch.aten.div.Tensor"(%10878, %936) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %10904 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %10905 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %10906 = "torch.aten.clamp"(%10903, %10904, %10905) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %10907 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %10908 = "torch.prims.convert_element_type"(%10906, %10907) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %10909 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %10910 = "torch.aten.unsqueeze"(%938, %10909) : (!torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>
    %10911 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %10912 = "torch.constant.int"() <{value = 1024 : i64}> : () -> !torch.int
    %10913 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %10914 = "torch.prim.ListConstruct"(%10911, %10912, %10913) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10915 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %10916 = "torch.aten.expand"(%10910, %10914, %10915) : (!torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ>
    %10917 = "torch_c.to_builtin_tensor"(%10908) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %10918 = "torch_c.to_builtin_tensor"(%10916) : (!torch.vtensor<[4,1024,4096],f8E4M3FNUZ>) -> tensor<4x1024x4096xf8E4M3FNUZ>
    %10919 = "util.call"(%10917, %10918) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x1x1024xf32>
    %10920 = "torch_c.from_builtin_tensor"(%10919) : (tensor<4x1x1024xf32>) -> !torch.vtensor<[4,1,1024],f32>
    %10921 = "torch.aten.div.Tensor"(%10920, %940) : (!torch.vtensor<[4,1,1024],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,1024],f32>
    %10922 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %10923 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %10924 = "torch.aten.clamp"(%10921, %10922, %10923) : (!torch.vtensor<[4,1,1024],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,1024],f32>
    %10925 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %10926 = "torch.prims.convert_element_type"(%10924, %10925) : (!torch.vtensor<[4,1,1024],f32>, !torch.int) -> !torch.vtensor<[4,1,1024],f8E4M3FNUZ>
    %10927 = "torch.aten.div.Tensor"(%10878, %942) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %10928 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %10929 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %10930 = "torch.aten.clamp"(%10927, %10928, %10929) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %10931 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %10932 = "torch.prims.convert_element_type"(%10930, %10931) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %10933 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %10934 = "torch.aten.unsqueeze"(%944, %10933) : (!torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>
    %10935 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %10936 = "torch.constant.int"() <{value = 1024 : i64}> : () -> !torch.int
    %10937 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %10938 = "torch.prim.ListConstruct"(%10935, %10936, %10937) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10939 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %10940 = "torch.aten.expand"(%10934, %10938, %10939) : (!torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ>
    %10941 = "torch_c.to_builtin_tensor"(%10932) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %10942 = "torch_c.to_builtin_tensor"(%10940) : (!torch.vtensor<[4,1024,4096],f8E4M3FNUZ>) -> tensor<4x1024x4096xf8E4M3FNUZ>
    %10943 = "util.call"(%10941, %10942) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x1x1024xf32>
    %10944 = "torch_c.from_builtin_tensor"(%10943) : (tensor<4x1x1024xf32>) -> !torch.vtensor<[4,1,1024],f32>
    %10945 = "torch.aten.div.Tensor"(%10944, %946) : (!torch.vtensor<[4,1,1024],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,1024],f32>
    %10946 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %10947 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %10948 = "torch.aten.clamp"(%10945, %10946, %10947) : (!torch.vtensor<[4,1,1024],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,1024],f32>
    %10949 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %10950 = "torch.prims.convert_element_type"(%10948, %10949) : (!torch.vtensor<[4,1,1024],f32>, !torch.int) -> !torch.vtensor<[4,1,1024],f8E4M3FNUZ>
    %10951 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %10952 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %10953 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %10954 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %10955 = "torch.prim.ListConstruct"(%10951, %10952, %10953, %10954) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10956 = "torch.aten.view"(%10902, %10955) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,1,32,128],f8E4M3FNUZ>
    %10957 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %10958 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %10959 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %10960 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %10961 = "torch.prim.ListConstruct"(%10957, %10958, %10959, %10960) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10962 = "torch.aten.view"(%10926, %10961) : (!torch.vtensor<[4,1,1024],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>
    %10963 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %10964 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %10965 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %10966 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %10967 = "torch.prim.ListConstruct"(%10963, %10964, %10965, %10966) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10968 = "torch.aten.view"(%10950, %10967) : (!torch.vtensor<[4,1,1024],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>
    %10969 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %10970 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %10971 = "torch.aten.transpose.int"(%10956, %10969, %10970) : (!torch.vtensor<[4,1,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %10972 = "torch.aten.mul.Tensor"(%10971, %1637) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,1,1,128],bf16>) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %10973 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %10974 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %10975 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %10976 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %10977 = "torch.aten.slice.Tensor"(%10971, %10973, %10974, %10975, %10976) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,32,1,64],f8E4M3FNUZ>
    %10978 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %10979 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %10980 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %10981 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %10982 = "torch.aten.slice.Tensor"(%10971, %10978, %10979, %10980, %10981) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,32,1,64],f8E4M3FNUZ>
    %10983 = "torch.aten.neg"(%10982) : (!torch.vtensor<[4,32,1,64],f8E4M3FNUZ>) -> !torch.vtensor<[4,32,1,64],f8E4M3FNUZ>
    %10984 = "torch.prim.ListConstruct"(%10983, %10977) : (!torch.vtensor<[4,32,1,64],f8E4M3FNUZ>, !torch.vtensor<[4,32,1,64],f8E4M3FNUZ>) -> !torch.list<vtensor>
    %10985 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %10986 = "torch.aten.cat"(%10984, %10985) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %10987 = "torch.aten.mul.Tensor"(%10986, %1651) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,1,1,128],bf16>) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %10988 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %10989 = "torch.aten.add.Tensor"(%10972, %10987, %10988) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %10990 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %10991 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %10992 = "torch.aten.transpose.int"(%10989, %10990, %10991) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,1,32,128],f8E4M3FNUZ>
    %10993 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %10994 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %10995 = "torch.aten.transpose.int"(%10962, %10993, %10994) : (!torch.vtensor<[4,1,8,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %10996 = "torch.aten.mul.Tensor"(%10995, %1637) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,1,1,128],bf16>) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %10997 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %10998 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %10999 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %11000 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %11001 = "torch.aten.slice.Tensor"(%10995, %10997, %10998, %10999, %11000) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,8,1,64],f8E4M3FNUZ>
    %11002 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %11003 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %11004 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %11005 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %11006 = "torch.aten.slice.Tensor"(%10995, %11002, %11003, %11004, %11005) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,8,1,64],f8E4M3FNUZ>
    %11007 = "torch.aten.neg"(%11006) : (!torch.vtensor<[4,8,1,64],f8E4M3FNUZ>) -> !torch.vtensor<[4,8,1,64],f8E4M3FNUZ>
    %11008 = "torch.prim.ListConstruct"(%11007, %11001) : (!torch.vtensor<[4,8,1,64],f8E4M3FNUZ>, !torch.vtensor<[4,8,1,64],f8E4M3FNUZ>) -> !torch.list<vtensor>
    %11009 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %11010 = "torch.aten.cat"(%11008, %11009) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %11011 = "torch.aten.mul.Tensor"(%11010, %1651) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,1,1,128],bf16>) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %11012 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %11013 = "torch.aten.add.Tensor"(%10996, %11011, %11012) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %11014 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %11015 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %11016 = "torch.aten.transpose.int"(%11013, %11014, %11015) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>
    %11017 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %11018 = "torch.aten.floor_divide.Scalar"(%arg64, %11017) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4],si64>
    %11019 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %11020 = "torch.aten.unsqueeze"(%11018, %11019) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %11021 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %11022 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %11023 = "torch.aten.gather"(%arg65, %11021, %11020, %11022) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.vtensor<[4,1],si64>, !torch.bool) -> !torch.vtensor<[4,1],si64>
    %11024 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %11025 = "torch.aten.remainder.Scalar"(%arg64, %11024) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4],si64>
    %11026 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %11027 = "torch.aten.unsqueeze"(%11025, %11026) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %11028 = "torch.constant.none"() : () -> !torch.none
    %11029 = "torch.aten.clone"(%947, %11028) : (!torch.vtensor<[],si64>, !torch.none) -> !torch.vtensor<[],si64>
    %11030 = "torch.aten.detach"(%11029) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %11031 = "torch.aten.detach"(%11030) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %11032 = "torch.aten.detach"(%11031) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %11033 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %11034 = "torch.aten.unsqueeze"(%11032, %11033) : (!torch.vtensor<[],si64>, !torch.int) -> !torch.vtensor<[1],si64>
    %11035 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %11036 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %11037 = "torch.prim.ListConstruct"(%11035, %11036) : (!torch.int, !torch.int) -> !torch.list<int>
    %11038 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %11039 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %11040 = "torch.prim.ListConstruct"(%11038, %11039) : (!torch.int, !torch.int) -> !torch.list<int>
    %11041 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %11042 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %11043 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %11044 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %11045 = "torch.aten.empty_strided"(%11037, %11040, %11041, %11042, %11043, %11044) : (!torch.list<int>, !torch.list<int>, !torch.int, !torch.int, !torch.Device, !torch.bool) -> !torch.vtensor<[4,1],si64>
    %11046 = "torch.constant.int"() <{value = 19 : i64}> : () -> !torch.int
    %11047 = "torch.aten.fill.Scalar"(%11045, %11046) : (!torch.vtensor<[4,1],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %11048 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %11049 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %11050 = "torch.prim.ListConstruct"(%11048, %11049) : (!torch.int, !torch.int) -> !torch.list<int>
    %11051 = "torch.aten.repeat"(%11034, %11050) : (!torch.vtensor<[1],si64>, !torch.list<int>) -> !torch.vtensor<[4,1],si64>
    %11052 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %11053 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %11054 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %11055 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %11056 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %11057 = "torch.prim.ListConstruct"(%1483, %11052, %11053, %11054, %11055, %11056) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %11058 = "torch.aten.view"(%10628, %11057) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%11058, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %11059 = "torch.prim.ListConstruct"(%11023, %11047, %11051, %11027) : (!torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>) -> !torch.list<optional<vtensor>>
    %11060 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %11061 = "torch.aten.index_put"(%11058, %11059, %11016, %11060) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>, !torch.bool) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%11061, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %11062 = "torch.constant.int"() <{value = 2097152 : i64}> : () -> !torch.int
    %11063 = "torch.prim.ListConstruct"(%1483, %11062) : (!torch.int, !torch.int) -> !torch.list<int>
    %11064 = "torch.aten.view"(%11061, %11063) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,2097152],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%11064, %1479) <{shape_expressions = #map2}> : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> ()
    %11065 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %11066 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %11067 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %11068 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %11069 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %11070 = "torch.prim.ListConstruct"(%1483, %11065, %11066, %11067, %11068, %11069) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %11071 = "torch.aten.view"(%11064, %11070) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%11071, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %11072 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %11073 = "torch.aten.floor_divide.Scalar"(%arg64, %11072) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4],si64>
    %11074 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %11075 = "torch.aten.unsqueeze"(%11073, %11074) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %11076 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %11077 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %11078 = "torch.aten.gather"(%arg65, %11076, %11075, %11077) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.vtensor<[4,1],si64>, !torch.bool) -> !torch.vtensor<[4,1],si64>
    %11079 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %11080 = "torch.aten.remainder.Scalar"(%arg64, %11079) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4],si64>
    %11081 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %11082 = "torch.aten.unsqueeze"(%11080, %11081) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %11083 = "torch.constant.none"() : () -> !torch.none
    %11084 = "torch.aten.clone"(%948, %11083) : (!torch.vtensor<[],si64>, !torch.none) -> !torch.vtensor<[],si64>
    %11085 = "torch.aten.detach"(%11084) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %11086 = "torch.aten.detach"(%11085) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %11087 = "torch.aten.detach"(%11086) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %11088 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %11089 = "torch.aten.unsqueeze"(%11087, %11088) : (!torch.vtensor<[],si64>, !torch.int) -> !torch.vtensor<[1],si64>
    %11090 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %11091 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %11092 = "torch.prim.ListConstruct"(%11090, %11091) : (!torch.int, !torch.int) -> !torch.list<int>
    %11093 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %11094 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %11095 = "torch.prim.ListConstruct"(%11093, %11094) : (!torch.int, !torch.int) -> !torch.list<int>
    %11096 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %11097 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %11098 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %11099 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %11100 = "torch.aten.empty_strided"(%11092, %11095, %11096, %11097, %11098, %11099) : (!torch.list<int>, !torch.list<int>, !torch.int, !torch.int, !torch.Device, !torch.bool) -> !torch.vtensor<[4,1],si64>
    %11101 = "torch.constant.int"() <{value = 19 : i64}> : () -> !torch.int
    %11102 = "torch.aten.fill.Scalar"(%11100, %11101) : (!torch.vtensor<[4,1],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %11103 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %11104 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %11105 = "torch.prim.ListConstruct"(%11103, %11104) : (!torch.int, !torch.int) -> !torch.list<int>
    %11106 = "torch.aten.repeat"(%11089, %11105) : (!torch.vtensor<[1],si64>, !torch.list<int>) -> !torch.vtensor<[4,1],si64>
    %11107 = "torch.prim.ListConstruct"(%11078, %11102, %11106, %11082) : (!torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>) -> !torch.list<optional<vtensor>>
    %11108 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %11109 = "torch.aten.index_put"(%11071, %11107, %10968, %11108) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>, !torch.bool) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%11109, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %11110 = "torch.constant.int"() <{value = 2097152 : i64}> : () -> !torch.int
    %11111 = "torch.prim.ListConstruct"(%1483, %11110) : (!torch.int, !torch.int) -> !torch.list<int>
    %11112 = "torch.aten.view"(%11109, %11111) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,2097152],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%11112, %1479) <{shape_expressions = #map2}> : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> ()
    %11113 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %11114 = "torch.aten.mul.Scalar"(%arg65, %11113) : (!torch.vtensor<[4,?],si64>, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%11114, %1478) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %11115 = "torch.constant.int"() <{value = 38 : i64}> : () -> !torch.int
    %11116 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %11117 = "torch.aten.add.Scalar"(%11114, %11115, %11116) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%11117, %1478) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %11118 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %11119 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %11120 = "torch.aten.add.Scalar"(%11117, %11118, %11119) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%11120, %1478) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %11121 = "torch.prim.ListConstruct"(%1924) : (!torch.int) -> !torch.list<int>
    %11122 = "torch.aten.view"(%11120, %11121) : (!torch.vtensor<[4,?],si64>, !torch.list<int>) -> !torch.vtensor<[?],si64>
    "torch.bind_symbolic_shape"(%11122, %1478) <{shape_expressions = #map24}> : (!torch.vtensor<[?],si64>, !torch.int) -> ()
    %11123 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %11124 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %11125 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %11126 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %11127 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %11128 = "torch.prim.ListConstruct"(%1483, %11123, %11124, %11125, %11126, %11127) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %11129 = "torch.aten.view"(%11112, %11128) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%11129, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %11130 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %11131 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %11132 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %11133 = "torch.prim.ListConstruct"(%1914, %11130, %11131, %11132) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %11134 = "torch.aten.view"(%11129, %11133) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%11134, %1479) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %11135 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %11136 = "torch.aten.index_select"(%11134, %11135, %11122) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int, !torch.vtensor<[?],si64>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%11136, %1478) <{shape_expressions = #map23}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %11137 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %11138 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %11139 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %11140 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %11141 = "torch.prim.ListConstruct"(%11137, %1481, %11138, %11139, %11140) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %11142 = "torch.aten.view"(%11136, %11141) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%11142, %1478) <{shape_expressions = #map22}> : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %11143 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %11144 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %11145 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %11146 = "torch.prim.ListConstruct"(%11143, %1485, %11144, %11145) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %11147 = "torch.aten.view"(%11142, %11146) : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%11147, %1478) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %11148 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %11149 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %11150 = "torch.aten.add.Scalar"(%11117, %11148, %11149) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%11150, %1478) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %11151 = "torch.prim.ListConstruct"(%1924) : (!torch.int) -> !torch.list<int>
    %11152 = "torch.aten.view"(%11150, %11151) : (!torch.vtensor<[4,?],si64>, !torch.list<int>) -> !torch.vtensor<[?],si64>
    "torch.bind_symbolic_shape"(%11152, %1478) <{shape_expressions = #map24}> : (!torch.vtensor<[?],si64>, !torch.int) -> ()
    %11153 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %11154 = "torch.aten.index_select"(%11134, %11153, %11152) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int, !torch.vtensor<[?],si64>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%11154, %1478) <{shape_expressions = #map23}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %11155 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %11156 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %11157 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %11158 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %11159 = "torch.prim.ListConstruct"(%11155, %1481, %11156, %11157, %11158) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %11160 = "torch.aten.view"(%11154, %11159) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%11160, %1478) <{shape_expressions = #map22}> : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %11161 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %11162 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %11163 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %11164 = "torch.prim.ListConstruct"(%11161, %1485, %11162, %11163) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %11165 = "torch.aten.view"(%11160, %11164) : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%11165, %1478) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %11166 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %11167 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %11168 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %11169 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %11170 = "torch.aten.slice.Tensor"(%11147, %11166, %11167, %11168, %11169) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%11170, %1478) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %11171 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %11172 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %11173 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %11174 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %11175 = "torch.aten.slice.Tensor"(%11165, %11171, %11172, %11173, %11174) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%11175, %1478) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %11176 = "torch.constant.int"() <{value = -2 : i64}> : () -> !torch.int
    %11177 = "torch.aten.unsqueeze"(%11170, %11176) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%11177, %1478) <{shape_expressions = #map25}> : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.int) -> ()
    %11178 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %11179 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %11180 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %11181 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %11182 = "torch.prim.ListConstruct"(%11178, %1485, %11179, %11180, %11181) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %11183 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %11184 = "torch.aten.expand"(%11177, %11182, %11183) : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%11184, %1478) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %11185 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %11186 = "torch.aten.clone"(%11184, %11185) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%11186, %1478) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %11187 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %11188 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %11189 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %11190 = "torch.prim.ListConstruct"(%11187, %1485, %11188, %11189) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %11191 = "torch.aten._unsafe_view"(%11186, %11190) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%11191, %1478) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %11192 = "torch.constant.int"() <{value = -2 : i64}> : () -> !torch.int
    %11193 = "torch.aten.unsqueeze"(%11175, %11192) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%11193, %1478) <{shape_expressions = #map25}> : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.int) -> ()
    %11194 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %11195 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %11196 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %11197 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %11198 = "torch.prim.ListConstruct"(%11194, %1485, %11195, %11196, %11197) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %11199 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %11200 = "torch.aten.expand"(%11193, %11198, %11199) : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%11200, %1478) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %11201 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %11202 = "torch.aten.clone"(%11200, %11201) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%11202, %1478) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %11203 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %11204 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %11205 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %11206 = "torch.prim.ListConstruct"(%11203, %1485, %11204, %11205) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %11207 = "torch.aten._unsafe_view"(%11202, %11206) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%11207, %1478) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %11208 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %11209 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %11210 = "torch.aten.transpose.int"(%10992, %11208, %11209) : (!torch.vtensor<[4,1,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %11211 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %11212 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %11213 = "torch.aten.transpose.int"(%11191, %11211, %11212) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%11213, %1478) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %11214 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %11215 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %11216 = "torch.aten.transpose.int"(%11207, %11214, %11215) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%11216, %1478) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %11217 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %11218 = "torch.aten.squeeze.dim"(%1516, %11217) : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,1,1,?],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%11218, %1478) <{shape_expressions = #map7}> : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>, !torch.int) -> ()
    %11219 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %11220 = "torch.aten.squeeze.dim"(%11218, %11219) : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,1,1,?],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%11220, %1478) <{shape_expressions = #map7}> : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>, !torch.int) -> ()
    %11221 = "torch_c.to_builtin_tensor"(%11210) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>) -> tensor<4x32x1x128xf8E4M3FNUZ>
    %11222 = "tensor.cast"(%11221) : (tensor<4x32x1x128xf8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %11223 = "torch_c.to_builtin_tensor"(%11213) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %11224 = "torch_c.to_builtin_tensor"(%11216) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %11225 = "torch_c.to_builtin_tensor"(%11220) : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>) -> tensor<4x1x1x?xf8E4M3FNUZ>
    %11226 = "tensor.cast"(%11225) : (tensor<4x1x1x?xf8E4M3FNUZ>) -> tensor<?x?x?x?xf8E4M3FNUZ>
    %11227 = "torch_c.to_builtin_tensor"(%950) : (!torch.vtensor<[],f32>) -> tensor<f32>
    %11228 = "util.call"(%11222, %11223, %11224, %11227, %11226) <{callee = @sharktank_masked_flash_attention_4_32_128_128_f8E4M3FNUZ_f32_f32}> : (tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<f32>, tensor<?x?x?x?xf8E4M3FNUZ>) -> tensor<4x32x?x128xf32>
    %11229 = "tensor.cast"(%11228) : (tensor<4x32x?x128xf32>) -> tensor<4x32x1x128xf32>
    %11230 = "torch_c.from_builtin_tensor"(%11229) : (tensor<4x32x1x128xf32>) -> !torch.vtensor<[4,32,1,128],f32>
    %11231 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %11232 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %11233 = "torch.aten.transpose.int"(%11230, %11231, %11232) : (!torch.vtensor<[4,32,1,128],f32>, !torch.int, !torch.int) -> !torch.vtensor<[4,1,32,128],f32>
    %11234 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %11235 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %11236 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %11237 = "torch.prim.ListConstruct"(%11234, %11235, %11236) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %11238 = "torch.aten.view"(%11233, %11237) : (!torch.vtensor<[4,1,32,128],f32>, !torch.list<int>) -> !torch.vtensor<[4,1,4096],f32>
    %11239 = "torch.aten.div.Tensor"(%11238, %952) : (!torch.vtensor<[4,1,4096],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],f32>
    %11240 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %11241 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %11242 = "torch.aten.clamp"(%11239, %11240, %11241) : (!torch.vtensor<[4,1,4096],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],f32>
    %11243 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %11244 = "torch.prims.convert_element_type"(%11242, %11243) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %11245 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %11246 = "torch.aten.unsqueeze"(%954, %11245) : (!torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>
    %11247 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %11248 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %11249 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %11250 = "torch.prim.ListConstruct"(%11247, %11248, %11249) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %11251 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %11252 = "torch.aten.expand"(%11246, %11250, %11251) : (!torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ>
    %11253 = "torch_c.to_builtin_tensor"(%11244) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %11254 = "torch_c.to_builtin_tensor"(%11252) : (!torch.vtensor<[4,4096,4096],f8E4M3FNUZ>) -> tensor<4x4096x4096xf8E4M3FNUZ>
    %11255 = "util.call"(%11253, %11254) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x1x4096xf32>
    %11256 = "torch_c.from_builtin_tensor"(%11255) : (tensor<4x1x4096xf32>) -> !torch.vtensor<[4,1,4096],f32>
    %11257 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %11258 = "torch.prims.convert_element_type"(%11256, %11257) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %11259 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %11260 = "torch.aten.add.Tensor"(%10859, %11258, %11259) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %11261 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %11262 = "torch.prims.convert_element_type"(%11260, %11261) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f32>
    %11263 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %11264 = "torch.aten.pow.Tensor_Scalar"(%11262, %11263) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],f32>
    %11265 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %11266 = "torch.prim.ListConstruct"(%11265) : (!torch.int) -> !torch.list<int>
    %11267 = "torch.constant.bool"() <{value = true}> : () -> !torch.bool
    %11268 = "torch.constant.none"() : () -> !torch.none
    %11269 = "torch.aten.mean.dim"(%11264, %11266, %11267, %11268) : (!torch.vtensor<[4,1,4096],f32>, !torch.list<int>, !torch.bool, !torch.none) -> !torch.vtensor<[4,1,1],f32>
    %11270 = "torch.constant.float"() <{value = 1.000000e-05 : f64}> : () -> !torch.float
    %11271 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %11272 = "torch.aten.add.Scalar"(%11269, %11270, %11271) : (!torch.vtensor<[4,1,1],f32>, !torch.float, !torch.int) -> !torch.vtensor<[4,1,1],f32>
    %11273 = "torch.aten.rsqrt"(%11272) : (!torch.vtensor<[4,1,1],f32>) -> !torch.vtensor<[4,1,1],f32>
    %11274 = "torch.aten.mul.Tensor"(%11262, %11273) : (!torch.vtensor<[4,1,4096],f32>, !torch.vtensor<[4,1,1],f32>) -> !torch.vtensor<[4,1,4096],f32>
    %11275 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %11276 = "torch.prims.convert_element_type"(%11274, %11275) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %11277 = "torch.aten.mul.Tensor"(%956, %11276) : (!torch.vtensor<[4096],bf16>, !torch.vtensor<[4,1,4096],bf16>) -> !torch.vtensor<[4,1,4096],bf16>
    %11278 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %11279 = "torch.prims.convert_element_type"(%11277, %11278) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %11280 = "torch.aten.div.Tensor"(%11279, %958) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %11281 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %11282 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %11283 = "torch.aten.clamp"(%11280, %11281, %11282) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %11284 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %11285 = "torch.prims.convert_element_type"(%11283, %11284) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %11286 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %11287 = "torch.aten.unsqueeze"(%960, %11286) : (!torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>
    %11288 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %11289 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %11290 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %11291 = "torch.prim.ListConstruct"(%11288, %11289, %11290) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %11292 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %11293 = "torch.aten.expand"(%11287, %11291, %11292) : (!torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ>
    %11294 = "torch_c.to_builtin_tensor"(%11285) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %11295 = "torch_c.to_builtin_tensor"(%11293) : (!torch.vtensor<[4,14336,4096],f8E4M3FNUZ>) -> tensor<4x14336x4096xf8E4M3FNUZ>
    %11296 = "util.call"(%11294, %11295) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x1x14336xf32>
    %11297 = "torch_c.from_builtin_tensor"(%11296) : (tensor<4x1x14336xf32>) -> !torch.vtensor<[4,1,14336],f32>
    %11298 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %11299 = "torch.prims.convert_element_type"(%11297, %11298) : (!torch.vtensor<[4,1,14336],f32>, !torch.int) -> !torch.vtensor<[4,1,14336],bf16>
    %11300 = "torch.aten.silu"(%11299) : (!torch.vtensor<[4,1,14336],bf16>) -> !torch.vtensor<[4,1,14336],bf16>
    %11301 = "torch.aten.div.Tensor"(%11279, %962) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %11302 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %11303 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %11304 = "torch.aten.clamp"(%11301, %11302, %11303) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %11305 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %11306 = "torch.prims.convert_element_type"(%11304, %11305) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %11307 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %11308 = "torch.aten.unsqueeze"(%964, %11307) : (!torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>
    %11309 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %11310 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %11311 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %11312 = "torch.prim.ListConstruct"(%11309, %11310, %11311) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %11313 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %11314 = "torch.aten.expand"(%11308, %11312, %11313) : (!torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ>
    %11315 = "torch_c.to_builtin_tensor"(%11306) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %11316 = "torch_c.to_builtin_tensor"(%11314) : (!torch.vtensor<[4,14336,4096],f8E4M3FNUZ>) -> tensor<4x14336x4096xf8E4M3FNUZ>
    %11317 = "util.call"(%11315, %11316) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x1x14336xf32>
    %11318 = "torch_c.from_builtin_tensor"(%11317) : (tensor<4x1x14336xf32>) -> !torch.vtensor<[4,1,14336],f32>
    %11319 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %11320 = "torch.prims.convert_element_type"(%11318, %11319) : (!torch.vtensor<[4,1,14336],f32>, !torch.int) -> !torch.vtensor<[4,1,14336],bf16>
    %11321 = "torch.aten.mul.Tensor"(%11300, %11320) : (!torch.vtensor<[4,1,14336],bf16>, !torch.vtensor<[4,1,14336],bf16>) -> !torch.vtensor<[4,1,14336],bf16>
    %11322 = "torch.aten.div.Tensor"(%11321, %966) : (!torch.vtensor<[4,1,14336],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,14336],bf16>
    %11323 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %11324 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %11325 = "torch.aten.clamp"(%11322, %11323, %11324) : (!torch.vtensor<[4,1,14336],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,14336],bf16>
    %11326 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %11327 = "torch.prims.convert_element_type"(%11325, %11326) : (!torch.vtensor<[4,1,14336],bf16>, !torch.int) -> !torch.vtensor<[4,1,14336],f8E4M3FNUZ>
    %11328 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %11329 = "torch.aten.unsqueeze"(%968, %11328) : (!torch.vtensor<[4096,14336],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,14336],f8E4M3FNUZ>
    %11330 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %11331 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %11332 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %11333 = "torch.prim.ListConstruct"(%11330, %11331, %11332) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %11334 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %11335 = "torch.aten.expand"(%11329, %11333, %11334) : (!torch.vtensor<[1,4096,14336],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,14336],f8E4M3FNUZ>
    %11336 = "torch_c.to_builtin_tensor"(%11327) : (!torch.vtensor<[4,1,14336],f8E4M3FNUZ>) -> tensor<4x1x14336xf8E4M3FNUZ>
    %11337 = "torch_c.to_builtin_tensor"(%11335) : (!torch.vtensor<[4,4096,14336],f8E4M3FNUZ>) -> tensor<4x4096x14336xf8E4M3FNUZ>
    %11338 = "util.call"(%11336, %11337) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x14336xf8E4M3FNUZ_R4x4096x14336xf8E4M3FNUZ}> : (tensor<4x1x14336xf8E4M3FNUZ>, tensor<4x4096x14336xf8E4M3FNUZ>) -> tensor<4x1x4096xf32>
    %11339 = "torch_c.from_builtin_tensor"(%11338) : (tensor<4x1x4096xf32>) -> !torch.vtensor<[4,1,4096],f32>
    %11340 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %11341 = "torch.prims.convert_element_type"(%11339, %11340) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %11342 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %11343 = "torch.aten.add.Tensor"(%11260, %11341, %11342) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %11344 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %11345 = "torch.prims.convert_element_type"(%11343, %11344) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f32>
    %11346 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %11347 = "torch.aten.pow.Tensor_Scalar"(%11345, %11346) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],f32>
    %11348 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %11349 = "torch.prim.ListConstruct"(%11348) : (!torch.int) -> !torch.list<int>
    %11350 = "torch.constant.bool"() <{value = true}> : () -> !torch.bool
    %11351 = "torch.constant.none"() : () -> !torch.none
    %11352 = "torch.aten.mean.dim"(%11347, %11349, %11350, %11351) : (!torch.vtensor<[4,1,4096],f32>, !torch.list<int>, !torch.bool, !torch.none) -> !torch.vtensor<[4,1,1],f32>
    %11353 = "torch.constant.float"() <{value = 1.000000e-05 : f64}> : () -> !torch.float
    %11354 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %11355 = "torch.aten.add.Scalar"(%11352, %11353, %11354) : (!torch.vtensor<[4,1,1],f32>, !torch.float, !torch.int) -> !torch.vtensor<[4,1,1],f32>
    %11356 = "torch.aten.rsqrt"(%11355) : (!torch.vtensor<[4,1,1],f32>) -> !torch.vtensor<[4,1,1],f32>
    %11357 = "torch.aten.mul.Tensor"(%11345, %11356) : (!torch.vtensor<[4,1,4096],f32>, !torch.vtensor<[4,1,1],f32>) -> !torch.vtensor<[4,1,4096],f32>
    %11358 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %11359 = "torch.prims.convert_element_type"(%11357, %11358) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %11360 = "torch.aten.mul.Tensor"(%970, %11359) : (!torch.vtensor<[4096],bf16>, !torch.vtensor<[4,1,4096],bf16>) -> !torch.vtensor<[4,1,4096],bf16>
    %11361 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %11362 = "torch.prims.convert_element_type"(%11360, %11361) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %11363 = "torch.aten.div.Tensor"(%11362, %972) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %11364 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %11365 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %11366 = "torch.aten.clamp"(%11363, %11364, %11365) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %11367 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %11368 = "torch.prims.convert_element_type"(%11366, %11367) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %11369 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %11370 = "torch.aten.unsqueeze"(%974, %11369) : (!torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>
    %11371 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %11372 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %11373 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %11374 = "torch.prim.ListConstruct"(%11371, %11372, %11373) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %11375 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %11376 = "torch.aten.expand"(%11370, %11374, %11375) : (!torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ>
    %11377 = "torch_c.to_builtin_tensor"(%11368) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %11378 = "torch_c.to_builtin_tensor"(%11376) : (!torch.vtensor<[4,4096,4096],f8E4M3FNUZ>) -> tensor<4x4096x4096xf8E4M3FNUZ>
    %11379 = "util.call"(%11377, %11378) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x1x4096xf32>
    %11380 = "torch_c.from_builtin_tensor"(%11379) : (tensor<4x1x4096xf32>) -> !torch.vtensor<[4,1,4096],f32>
    %11381 = "torch.aten.div.Tensor"(%11380, %976) : (!torch.vtensor<[4,1,4096],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],f32>
    %11382 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %11383 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %11384 = "torch.aten.clamp"(%11381, %11382, %11383) : (!torch.vtensor<[4,1,4096],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],f32>
    %11385 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %11386 = "torch.prims.convert_element_type"(%11384, %11385) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %11387 = "torch.aten.div.Tensor"(%11362, %978) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %11388 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %11389 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %11390 = "torch.aten.clamp"(%11387, %11388, %11389) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %11391 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %11392 = "torch.prims.convert_element_type"(%11390, %11391) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %11393 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %11394 = "torch.aten.unsqueeze"(%980, %11393) : (!torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>
    %11395 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %11396 = "torch.constant.int"() <{value = 1024 : i64}> : () -> !torch.int
    %11397 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %11398 = "torch.prim.ListConstruct"(%11395, %11396, %11397) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %11399 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %11400 = "torch.aten.expand"(%11394, %11398, %11399) : (!torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ>
    %11401 = "torch_c.to_builtin_tensor"(%11392) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %11402 = "torch_c.to_builtin_tensor"(%11400) : (!torch.vtensor<[4,1024,4096],f8E4M3FNUZ>) -> tensor<4x1024x4096xf8E4M3FNUZ>
    %11403 = "util.call"(%11401, %11402) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x1x1024xf32>
    %11404 = "torch_c.from_builtin_tensor"(%11403) : (tensor<4x1x1024xf32>) -> !torch.vtensor<[4,1,1024],f32>
    %11405 = "torch.aten.div.Tensor"(%11404, %982) : (!torch.vtensor<[4,1,1024],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,1024],f32>
    %11406 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %11407 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %11408 = "torch.aten.clamp"(%11405, %11406, %11407) : (!torch.vtensor<[4,1,1024],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,1024],f32>
    %11409 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %11410 = "torch.prims.convert_element_type"(%11408, %11409) : (!torch.vtensor<[4,1,1024],f32>, !torch.int) -> !torch.vtensor<[4,1,1024],f8E4M3FNUZ>
    %11411 = "torch.aten.div.Tensor"(%11362, %984) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %11412 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %11413 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %11414 = "torch.aten.clamp"(%11411, %11412, %11413) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %11415 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %11416 = "torch.prims.convert_element_type"(%11414, %11415) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %11417 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %11418 = "torch.aten.unsqueeze"(%986, %11417) : (!torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>
    %11419 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %11420 = "torch.constant.int"() <{value = 1024 : i64}> : () -> !torch.int
    %11421 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %11422 = "torch.prim.ListConstruct"(%11419, %11420, %11421) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %11423 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %11424 = "torch.aten.expand"(%11418, %11422, %11423) : (!torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ>
    %11425 = "torch_c.to_builtin_tensor"(%11416) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %11426 = "torch_c.to_builtin_tensor"(%11424) : (!torch.vtensor<[4,1024,4096],f8E4M3FNUZ>) -> tensor<4x1024x4096xf8E4M3FNUZ>
    %11427 = "util.call"(%11425, %11426) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x1x1024xf32>
    %11428 = "torch_c.from_builtin_tensor"(%11427) : (tensor<4x1x1024xf32>) -> !torch.vtensor<[4,1,1024],f32>
    %11429 = "torch.aten.div.Tensor"(%11428, %988) : (!torch.vtensor<[4,1,1024],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,1024],f32>
    %11430 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %11431 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %11432 = "torch.aten.clamp"(%11429, %11430, %11431) : (!torch.vtensor<[4,1,1024],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,1024],f32>
    %11433 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %11434 = "torch.prims.convert_element_type"(%11432, %11433) : (!torch.vtensor<[4,1,1024],f32>, !torch.int) -> !torch.vtensor<[4,1,1024],f8E4M3FNUZ>
    %11435 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %11436 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %11437 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %11438 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %11439 = "torch.prim.ListConstruct"(%11435, %11436, %11437, %11438) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %11440 = "torch.aten.view"(%11386, %11439) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,1,32,128],f8E4M3FNUZ>
    %11441 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %11442 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %11443 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %11444 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %11445 = "torch.prim.ListConstruct"(%11441, %11442, %11443, %11444) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %11446 = "torch.aten.view"(%11410, %11445) : (!torch.vtensor<[4,1,1024],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>
    %11447 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %11448 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %11449 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %11450 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %11451 = "torch.prim.ListConstruct"(%11447, %11448, %11449, %11450) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %11452 = "torch.aten.view"(%11434, %11451) : (!torch.vtensor<[4,1,1024],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>
    %11453 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %11454 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %11455 = "torch.aten.transpose.int"(%11440, %11453, %11454) : (!torch.vtensor<[4,1,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %11456 = "torch.aten.mul.Tensor"(%11455, %1637) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,1,1,128],bf16>) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %11457 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %11458 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %11459 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %11460 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %11461 = "torch.aten.slice.Tensor"(%11455, %11457, %11458, %11459, %11460) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,32,1,64],f8E4M3FNUZ>
    %11462 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %11463 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %11464 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %11465 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %11466 = "torch.aten.slice.Tensor"(%11455, %11462, %11463, %11464, %11465) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,32,1,64],f8E4M3FNUZ>
    %11467 = "torch.aten.neg"(%11466) : (!torch.vtensor<[4,32,1,64],f8E4M3FNUZ>) -> !torch.vtensor<[4,32,1,64],f8E4M3FNUZ>
    %11468 = "torch.prim.ListConstruct"(%11467, %11461) : (!torch.vtensor<[4,32,1,64],f8E4M3FNUZ>, !torch.vtensor<[4,32,1,64],f8E4M3FNUZ>) -> !torch.list<vtensor>
    %11469 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %11470 = "torch.aten.cat"(%11468, %11469) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %11471 = "torch.aten.mul.Tensor"(%11470, %1651) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,1,1,128],bf16>) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %11472 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %11473 = "torch.aten.add.Tensor"(%11456, %11471, %11472) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %11474 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %11475 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %11476 = "torch.aten.transpose.int"(%11473, %11474, %11475) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,1,32,128],f8E4M3FNUZ>
    %11477 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %11478 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %11479 = "torch.aten.transpose.int"(%11446, %11477, %11478) : (!torch.vtensor<[4,1,8,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %11480 = "torch.aten.mul.Tensor"(%11479, %1637) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,1,1,128],bf16>) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %11481 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %11482 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %11483 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %11484 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %11485 = "torch.aten.slice.Tensor"(%11479, %11481, %11482, %11483, %11484) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,8,1,64],f8E4M3FNUZ>
    %11486 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %11487 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %11488 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %11489 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %11490 = "torch.aten.slice.Tensor"(%11479, %11486, %11487, %11488, %11489) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,8,1,64],f8E4M3FNUZ>
    %11491 = "torch.aten.neg"(%11490) : (!torch.vtensor<[4,8,1,64],f8E4M3FNUZ>) -> !torch.vtensor<[4,8,1,64],f8E4M3FNUZ>
    %11492 = "torch.prim.ListConstruct"(%11491, %11485) : (!torch.vtensor<[4,8,1,64],f8E4M3FNUZ>, !torch.vtensor<[4,8,1,64],f8E4M3FNUZ>) -> !torch.list<vtensor>
    %11493 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %11494 = "torch.aten.cat"(%11492, %11493) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %11495 = "torch.aten.mul.Tensor"(%11494, %1651) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,1,1,128],bf16>) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %11496 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %11497 = "torch.aten.add.Tensor"(%11480, %11495, %11496) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %11498 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %11499 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %11500 = "torch.aten.transpose.int"(%11497, %11498, %11499) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>
    %11501 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %11502 = "torch.aten.floor_divide.Scalar"(%arg64, %11501) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4],si64>
    %11503 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %11504 = "torch.aten.unsqueeze"(%11502, %11503) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %11505 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %11506 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %11507 = "torch.aten.gather"(%arg65, %11505, %11504, %11506) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.vtensor<[4,1],si64>, !torch.bool) -> !torch.vtensor<[4,1],si64>
    %11508 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %11509 = "torch.aten.remainder.Scalar"(%arg64, %11508) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4],si64>
    %11510 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %11511 = "torch.aten.unsqueeze"(%11509, %11510) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %11512 = "torch.constant.none"() : () -> !torch.none
    %11513 = "torch.aten.clone"(%989, %11512) : (!torch.vtensor<[],si64>, !torch.none) -> !torch.vtensor<[],si64>
    %11514 = "torch.aten.detach"(%11513) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %11515 = "torch.aten.detach"(%11514) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %11516 = "torch.aten.detach"(%11515) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %11517 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %11518 = "torch.aten.unsqueeze"(%11516, %11517) : (!torch.vtensor<[],si64>, !torch.int) -> !torch.vtensor<[1],si64>
    %11519 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %11520 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %11521 = "torch.prim.ListConstruct"(%11519, %11520) : (!torch.int, !torch.int) -> !torch.list<int>
    %11522 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %11523 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %11524 = "torch.prim.ListConstruct"(%11522, %11523) : (!torch.int, !torch.int) -> !torch.list<int>
    %11525 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %11526 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %11527 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %11528 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %11529 = "torch.aten.empty_strided"(%11521, %11524, %11525, %11526, %11527, %11528) : (!torch.list<int>, !torch.list<int>, !torch.int, !torch.int, !torch.Device, !torch.bool) -> !torch.vtensor<[4,1],si64>
    %11530 = "torch.constant.int"() <{value = 20 : i64}> : () -> !torch.int
    %11531 = "torch.aten.fill.Scalar"(%11529, %11530) : (!torch.vtensor<[4,1],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %11532 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %11533 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %11534 = "torch.prim.ListConstruct"(%11532, %11533) : (!torch.int, !torch.int) -> !torch.list<int>
    %11535 = "torch.aten.repeat"(%11518, %11534) : (!torch.vtensor<[1],si64>, !torch.list<int>) -> !torch.vtensor<[4,1],si64>
    %11536 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %11537 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %11538 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %11539 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %11540 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %11541 = "torch.prim.ListConstruct"(%1483, %11536, %11537, %11538, %11539, %11540) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %11542 = "torch.aten.view"(%11112, %11541) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%11542, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %11543 = "torch.prim.ListConstruct"(%11507, %11531, %11535, %11511) : (!torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>) -> !torch.list<optional<vtensor>>
    %11544 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %11545 = "torch.aten.index_put"(%11542, %11543, %11500, %11544) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>, !torch.bool) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%11545, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %11546 = "torch.constant.int"() <{value = 2097152 : i64}> : () -> !torch.int
    %11547 = "torch.prim.ListConstruct"(%1483, %11546) : (!torch.int, !torch.int) -> !torch.list<int>
    %11548 = "torch.aten.view"(%11545, %11547) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,2097152],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%11548, %1479) <{shape_expressions = #map2}> : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> ()
    %11549 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %11550 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %11551 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %11552 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %11553 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %11554 = "torch.prim.ListConstruct"(%1483, %11549, %11550, %11551, %11552, %11553) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %11555 = "torch.aten.view"(%11548, %11554) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%11555, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %11556 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %11557 = "torch.aten.floor_divide.Scalar"(%arg64, %11556) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4],si64>
    %11558 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %11559 = "torch.aten.unsqueeze"(%11557, %11558) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %11560 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %11561 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %11562 = "torch.aten.gather"(%arg65, %11560, %11559, %11561) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.vtensor<[4,1],si64>, !torch.bool) -> !torch.vtensor<[4,1],si64>
    %11563 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %11564 = "torch.aten.remainder.Scalar"(%arg64, %11563) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4],si64>
    %11565 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %11566 = "torch.aten.unsqueeze"(%11564, %11565) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %11567 = "torch.constant.none"() : () -> !torch.none
    %11568 = "torch.aten.clone"(%990, %11567) : (!torch.vtensor<[],si64>, !torch.none) -> !torch.vtensor<[],si64>
    %11569 = "torch.aten.detach"(%11568) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %11570 = "torch.aten.detach"(%11569) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %11571 = "torch.aten.detach"(%11570) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %11572 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %11573 = "torch.aten.unsqueeze"(%11571, %11572) : (!torch.vtensor<[],si64>, !torch.int) -> !torch.vtensor<[1],si64>
    %11574 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %11575 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %11576 = "torch.prim.ListConstruct"(%11574, %11575) : (!torch.int, !torch.int) -> !torch.list<int>
    %11577 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %11578 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %11579 = "torch.prim.ListConstruct"(%11577, %11578) : (!torch.int, !torch.int) -> !torch.list<int>
    %11580 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %11581 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %11582 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %11583 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %11584 = "torch.aten.empty_strided"(%11576, %11579, %11580, %11581, %11582, %11583) : (!torch.list<int>, !torch.list<int>, !torch.int, !torch.int, !torch.Device, !torch.bool) -> !torch.vtensor<[4,1],si64>
    %11585 = "torch.constant.int"() <{value = 20 : i64}> : () -> !torch.int
    %11586 = "torch.aten.fill.Scalar"(%11584, %11585) : (!torch.vtensor<[4,1],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %11587 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %11588 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %11589 = "torch.prim.ListConstruct"(%11587, %11588) : (!torch.int, !torch.int) -> !torch.list<int>
    %11590 = "torch.aten.repeat"(%11573, %11589) : (!torch.vtensor<[1],si64>, !torch.list<int>) -> !torch.vtensor<[4,1],si64>
    %11591 = "torch.prim.ListConstruct"(%11562, %11586, %11590, %11566) : (!torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>) -> !torch.list<optional<vtensor>>
    %11592 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %11593 = "torch.aten.index_put"(%11555, %11591, %11452, %11592) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>, !torch.bool) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%11593, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %11594 = "torch.constant.int"() <{value = 2097152 : i64}> : () -> !torch.int
    %11595 = "torch.prim.ListConstruct"(%1483, %11594) : (!torch.int, !torch.int) -> !torch.list<int>
    %11596 = "torch.aten.view"(%11593, %11595) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,2097152],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%11596, %1479) <{shape_expressions = #map2}> : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> ()
    %11597 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %11598 = "torch.aten.mul.Scalar"(%arg65, %11597) : (!torch.vtensor<[4,?],si64>, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%11598, %1478) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %11599 = "torch.constant.int"() <{value = 40 : i64}> : () -> !torch.int
    %11600 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %11601 = "torch.aten.add.Scalar"(%11598, %11599, %11600) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%11601, %1478) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %11602 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %11603 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %11604 = "torch.aten.add.Scalar"(%11601, %11602, %11603) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%11604, %1478) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %11605 = "torch.prim.ListConstruct"(%1924) : (!torch.int) -> !torch.list<int>
    %11606 = "torch.aten.view"(%11604, %11605) : (!torch.vtensor<[4,?],si64>, !torch.list<int>) -> !torch.vtensor<[?],si64>
    "torch.bind_symbolic_shape"(%11606, %1478) <{shape_expressions = #map24}> : (!torch.vtensor<[?],si64>, !torch.int) -> ()
    %11607 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %11608 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %11609 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %11610 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %11611 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %11612 = "torch.prim.ListConstruct"(%1483, %11607, %11608, %11609, %11610, %11611) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %11613 = "torch.aten.view"(%11596, %11612) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%11613, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %11614 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %11615 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %11616 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %11617 = "torch.prim.ListConstruct"(%1914, %11614, %11615, %11616) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %11618 = "torch.aten.view"(%11613, %11617) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%11618, %1479) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %11619 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %11620 = "torch.aten.index_select"(%11618, %11619, %11606) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int, !torch.vtensor<[?],si64>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%11620, %1478) <{shape_expressions = #map23}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %11621 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %11622 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %11623 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %11624 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %11625 = "torch.prim.ListConstruct"(%11621, %1481, %11622, %11623, %11624) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %11626 = "torch.aten.view"(%11620, %11625) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%11626, %1478) <{shape_expressions = #map22}> : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %11627 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %11628 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %11629 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %11630 = "torch.prim.ListConstruct"(%11627, %1485, %11628, %11629) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %11631 = "torch.aten.view"(%11626, %11630) : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%11631, %1478) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %11632 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %11633 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %11634 = "torch.aten.add.Scalar"(%11601, %11632, %11633) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%11634, %1478) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %11635 = "torch.prim.ListConstruct"(%1924) : (!torch.int) -> !torch.list<int>
    %11636 = "torch.aten.view"(%11634, %11635) : (!torch.vtensor<[4,?],si64>, !torch.list<int>) -> !torch.vtensor<[?],si64>
    "torch.bind_symbolic_shape"(%11636, %1478) <{shape_expressions = #map24}> : (!torch.vtensor<[?],si64>, !torch.int) -> ()
    %11637 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %11638 = "torch.aten.index_select"(%11618, %11637, %11636) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int, !torch.vtensor<[?],si64>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%11638, %1478) <{shape_expressions = #map23}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %11639 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %11640 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %11641 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %11642 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %11643 = "torch.prim.ListConstruct"(%11639, %1481, %11640, %11641, %11642) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %11644 = "torch.aten.view"(%11638, %11643) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%11644, %1478) <{shape_expressions = #map22}> : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %11645 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %11646 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %11647 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %11648 = "torch.prim.ListConstruct"(%11645, %1485, %11646, %11647) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %11649 = "torch.aten.view"(%11644, %11648) : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%11649, %1478) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %11650 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %11651 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %11652 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %11653 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %11654 = "torch.aten.slice.Tensor"(%11631, %11650, %11651, %11652, %11653) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%11654, %1478) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %11655 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %11656 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %11657 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %11658 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %11659 = "torch.aten.slice.Tensor"(%11649, %11655, %11656, %11657, %11658) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%11659, %1478) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %11660 = "torch.constant.int"() <{value = -2 : i64}> : () -> !torch.int
    %11661 = "torch.aten.unsqueeze"(%11654, %11660) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%11661, %1478) <{shape_expressions = #map25}> : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.int) -> ()
    %11662 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %11663 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %11664 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %11665 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %11666 = "torch.prim.ListConstruct"(%11662, %1485, %11663, %11664, %11665) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %11667 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %11668 = "torch.aten.expand"(%11661, %11666, %11667) : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%11668, %1478) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %11669 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %11670 = "torch.aten.clone"(%11668, %11669) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%11670, %1478) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %11671 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %11672 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %11673 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %11674 = "torch.prim.ListConstruct"(%11671, %1485, %11672, %11673) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %11675 = "torch.aten._unsafe_view"(%11670, %11674) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%11675, %1478) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %11676 = "torch.constant.int"() <{value = -2 : i64}> : () -> !torch.int
    %11677 = "torch.aten.unsqueeze"(%11659, %11676) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%11677, %1478) <{shape_expressions = #map25}> : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.int) -> ()
    %11678 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %11679 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %11680 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %11681 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %11682 = "torch.prim.ListConstruct"(%11678, %1485, %11679, %11680, %11681) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %11683 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %11684 = "torch.aten.expand"(%11677, %11682, %11683) : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%11684, %1478) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %11685 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %11686 = "torch.aten.clone"(%11684, %11685) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%11686, %1478) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %11687 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %11688 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %11689 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %11690 = "torch.prim.ListConstruct"(%11687, %1485, %11688, %11689) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %11691 = "torch.aten._unsafe_view"(%11686, %11690) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%11691, %1478) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %11692 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %11693 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %11694 = "torch.aten.transpose.int"(%11476, %11692, %11693) : (!torch.vtensor<[4,1,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %11695 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %11696 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %11697 = "torch.aten.transpose.int"(%11675, %11695, %11696) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%11697, %1478) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %11698 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %11699 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %11700 = "torch.aten.transpose.int"(%11691, %11698, %11699) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%11700, %1478) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %11701 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %11702 = "torch.aten.squeeze.dim"(%1516, %11701) : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,1,1,?],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%11702, %1478) <{shape_expressions = #map7}> : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>, !torch.int) -> ()
    %11703 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %11704 = "torch.aten.squeeze.dim"(%11702, %11703) : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,1,1,?],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%11704, %1478) <{shape_expressions = #map7}> : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>, !torch.int) -> ()
    %11705 = "torch_c.to_builtin_tensor"(%11694) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>) -> tensor<4x32x1x128xf8E4M3FNUZ>
    %11706 = "tensor.cast"(%11705) : (tensor<4x32x1x128xf8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %11707 = "torch_c.to_builtin_tensor"(%11697) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %11708 = "torch_c.to_builtin_tensor"(%11700) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %11709 = "torch_c.to_builtin_tensor"(%11704) : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>) -> tensor<4x1x1x?xf8E4M3FNUZ>
    %11710 = "tensor.cast"(%11709) : (tensor<4x1x1x?xf8E4M3FNUZ>) -> tensor<?x?x?x?xf8E4M3FNUZ>
    %11711 = "torch_c.to_builtin_tensor"(%992) : (!torch.vtensor<[],f32>) -> tensor<f32>
    %11712 = "util.call"(%11706, %11707, %11708, %11711, %11710) <{callee = @sharktank_masked_flash_attention_4_32_128_128_f8E4M3FNUZ_f32_f32}> : (tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<f32>, tensor<?x?x?x?xf8E4M3FNUZ>) -> tensor<4x32x?x128xf32>
    %11713 = "tensor.cast"(%11712) : (tensor<4x32x?x128xf32>) -> tensor<4x32x1x128xf32>
    %11714 = "torch_c.from_builtin_tensor"(%11713) : (tensor<4x32x1x128xf32>) -> !torch.vtensor<[4,32,1,128],f32>
    %11715 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %11716 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %11717 = "torch.aten.transpose.int"(%11714, %11715, %11716) : (!torch.vtensor<[4,32,1,128],f32>, !torch.int, !torch.int) -> !torch.vtensor<[4,1,32,128],f32>
    %11718 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %11719 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %11720 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %11721 = "torch.prim.ListConstruct"(%11718, %11719, %11720) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %11722 = "torch.aten.view"(%11717, %11721) : (!torch.vtensor<[4,1,32,128],f32>, !torch.list<int>) -> !torch.vtensor<[4,1,4096],f32>
    %11723 = "torch.aten.div.Tensor"(%11722, %994) : (!torch.vtensor<[4,1,4096],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],f32>
    %11724 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %11725 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %11726 = "torch.aten.clamp"(%11723, %11724, %11725) : (!torch.vtensor<[4,1,4096],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],f32>
    %11727 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %11728 = "torch.prims.convert_element_type"(%11726, %11727) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %11729 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %11730 = "torch.aten.unsqueeze"(%996, %11729) : (!torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>
    %11731 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %11732 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %11733 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %11734 = "torch.prim.ListConstruct"(%11731, %11732, %11733) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %11735 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %11736 = "torch.aten.expand"(%11730, %11734, %11735) : (!torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ>
    %11737 = "torch_c.to_builtin_tensor"(%11728) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %11738 = "torch_c.to_builtin_tensor"(%11736) : (!torch.vtensor<[4,4096,4096],f8E4M3FNUZ>) -> tensor<4x4096x4096xf8E4M3FNUZ>
    %11739 = "util.call"(%11737, %11738) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x1x4096xf32>
    %11740 = "torch_c.from_builtin_tensor"(%11739) : (tensor<4x1x4096xf32>) -> !torch.vtensor<[4,1,4096],f32>
    %11741 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %11742 = "torch.prims.convert_element_type"(%11740, %11741) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %11743 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %11744 = "torch.aten.add.Tensor"(%11343, %11742, %11743) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %11745 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %11746 = "torch.prims.convert_element_type"(%11744, %11745) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f32>
    %11747 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %11748 = "torch.aten.pow.Tensor_Scalar"(%11746, %11747) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],f32>
    %11749 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %11750 = "torch.prim.ListConstruct"(%11749) : (!torch.int) -> !torch.list<int>
    %11751 = "torch.constant.bool"() <{value = true}> : () -> !torch.bool
    %11752 = "torch.constant.none"() : () -> !torch.none
    %11753 = "torch.aten.mean.dim"(%11748, %11750, %11751, %11752) : (!torch.vtensor<[4,1,4096],f32>, !torch.list<int>, !torch.bool, !torch.none) -> !torch.vtensor<[4,1,1],f32>
    %11754 = "torch.constant.float"() <{value = 1.000000e-05 : f64}> : () -> !torch.float
    %11755 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %11756 = "torch.aten.add.Scalar"(%11753, %11754, %11755) : (!torch.vtensor<[4,1,1],f32>, !torch.float, !torch.int) -> !torch.vtensor<[4,1,1],f32>
    %11757 = "torch.aten.rsqrt"(%11756) : (!torch.vtensor<[4,1,1],f32>) -> !torch.vtensor<[4,1,1],f32>
    %11758 = "torch.aten.mul.Tensor"(%11746, %11757) : (!torch.vtensor<[4,1,4096],f32>, !torch.vtensor<[4,1,1],f32>) -> !torch.vtensor<[4,1,4096],f32>
    %11759 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %11760 = "torch.prims.convert_element_type"(%11758, %11759) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %11761 = "torch.aten.mul.Tensor"(%998, %11760) : (!torch.vtensor<[4096],bf16>, !torch.vtensor<[4,1,4096],bf16>) -> !torch.vtensor<[4,1,4096],bf16>
    %11762 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %11763 = "torch.prims.convert_element_type"(%11761, %11762) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %11764 = "torch.aten.div.Tensor"(%11763, %1000) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %11765 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %11766 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %11767 = "torch.aten.clamp"(%11764, %11765, %11766) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %11768 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %11769 = "torch.prims.convert_element_type"(%11767, %11768) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %11770 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %11771 = "torch.aten.unsqueeze"(%1002, %11770) : (!torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>
    %11772 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %11773 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %11774 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %11775 = "torch.prim.ListConstruct"(%11772, %11773, %11774) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %11776 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %11777 = "torch.aten.expand"(%11771, %11775, %11776) : (!torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ>
    %11778 = "torch_c.to_builtin_tensor"(%11769) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %11779 = "torch_c.to_builtin_tensor"(%11777) : (!torch.vtensor<[4,14336,4096],f8E4M3FNUZ>) -> tensor<4x14336x4096xf8E4M3FNUZ>
    %11780 = "util.call"(%11778, %11779) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x1x14336xf32>
    %11781 = "torch_c.from_builtin_tensor"(%11780) : (tensor<4x1x14336xf32>) -> !torch.vtensor<[4,1,14336],f32>
    %11782 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %11783 = "torch.prims.convert_element_type"(%11781, %11782) : (!torch.vtensor<[4,1,14336],f32>, !torch.int) -> !torch.vtensor<[4,1,14336],bf16>
    %11784 = "torch.aten.silu"(%11783) : (!torch.vtensor<[4,1,14336],bf16>) -> !torch.vtensor<[4,1,14336],bf16>
    %11785 = "torch.aten.div.Tensor"(%11763, %1004) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %11786 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %11787 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %11788 = "torch.aten.clamp"(%11785, %11786, %11787) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %11789 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %11790 = "torch.prims.convert_element_type"(%11788, %11789) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %11791 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %11792 = "torch.aten.unsqueeze"(%1006, %11791) : (!torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>
    %11793 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %11794 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %11795 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %11796 = "torch.prim.ListConstruct"(%11793, %11794, %11795) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %11797 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %11798 = "torch.aten.expand"(%11792, %11796, %11797) : (!torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ>
    %11799 = "torch_c.to_builtin_tensor"(%11790) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %11800 = "torch_c.to_builtin_tensor"(%11798) : (!torch.vtensor<[4,14336,4096],f8E4M3FNUZ>) -> tensor<4x14336x4096xf8E4M3FNUZ>
    %11801 = "util.call"(%11799, %11800) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x1x14336xf32>
    %11802 = "torch_c.from_builtin_tensor"(%11801) : (tensor<4x1x14336xf32>) -> !torch.vtensor<[4,1,14336],f32>
    %11803 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %11804 = "torch.prims.convert_element_type"(%11802, %11803) : (!torch.vtensor<[4,1,14336],f32>, !torch.int) -> !torch.vtensor<[4,1,14336],bf16>
    %11805 = "torch.aten.mul.Tensor"(%11784, %11804) : (!torch.vtensor<[4,1,14336],bf16>, !torch.vtensor<[4,1,14336],bf16>) -> !torch.vtensor<[4,1,14336],bf16>
    %11806 = "torch.aten.div.Tensor"(%11805, %1008) : (!torch.vtensor<[4,1,14336],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,14336],bf16>
    %11807 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %11808 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %11809 = "torch.aten.clamp"(%11806, %11807, %11808) : (!torch.vtensor<[4,1,14336],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,14336],bf16>
    %11810 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %11811 = "torch.prims.convert_element_type"(%11809, %11810) : (!torch.vtensor<[4,1,14336],bf16>, !torch.int) -> !torch.vtensor<[4,1,14336],f8E4M3FNUZ>
    %11812 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %11813 = "torch.aten.unsqueeze"(%1010, %11812) : (!torch.vtensor<[4096,14336],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,14336],f8E4M3FNUZ>
    %11814 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %11815 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %11816 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %11817 = "torch.prim.ListConstruct"(%11814, %11815, %11816) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %11818 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %11819 = "torch.aten.expand"(%11813, %11817, %11818) : (!torch.vtensor<[1,4096,14336],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,14336],f8E4M3FNUZ>
    %11820 = "torch_c.to_builtin_tensor"(%11811) : (!torch.vtensor<[4,1,14336],f8E4M3FNUZ>) -> tensor<4x1x14336xf8E4M3FNUZ>
    %11821 = "torch_c.to_builtin_tensor"(%11819) : (!torch.vtensor<[4,4096,14336],f8E4M3FNUZ>) -> tensor<4x4096x14336xf8E4M3FNUZ>
    %11822 = "util.call"(%11820, %11821) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x14336xf8E4M3FNUZ_R4x4096x14336xf8E4M3FNUZ}> : (tensor<4x1x14336xf8E4M3FNUZ>, tensor<4x4096x14336xf8E4M3FNUZ>) -> tensor<4x1x4096xf32>
    %11823 = "torch_c.from_builtin_tensor"(%11822) : (tensor<4x1x4096xf32>) -> !torch.vtensor<[4,1,4096],f32>
    %11824 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %11825 = "torch.prims.convert_element_type"(%11823, %11824) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %11826 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %11827 = "torch.aten.add.Tensor"(%11744, %11825, %11826) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %11828 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %11829 = "torch.prims.convert_element_type"(%11827, %11828) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f32>
    %11830 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %11831 = "torch.aten.pow.Tensor_Scalar"(%11829, %11830) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],f32>
    %11832 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %11833 = "torch.prim.ListConstruct"(%11832) : (!torch.int) -> !torch.list<int>
    %11834 = "torch.constant.bool"() <{value = true}> : () -> !torch.bool
    %11835 = "torch.constant.none"() : () -> !torch.none
    %11836 = "torch.aten.mean.dim"(%11831, %11833, %11834, %11835) : (!torch.vtensor<[4,1,4096],f32>, !torch.list<int>, !torch.bool, !torch.none) -> !torch.vtensor<[4,1,1],f32>
    %11837 = "torch.constant.float"() <{value = 1.000000e-05 : f64}> : () -> !torch.float
    %11838 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %11839 = "torch.aten.add.Scalar"(%11836, %11837, %11838) : (!torch.vtensor<[4,1,1],f32>, !torch.float, !torch.int) -> !torch.vtensor<[4,1,1],f32>
    %11840 = "torch.aten.rsqrt"(%11839) : (!torch.vtensor<[4,1,1],f32>) -> !torch.vtensor<[4,1,1],f32>
    %11841 = "torch.aten.mul.Tensor"(%11829, %11840) : (!torch.vtensor<[4,1,4096],f32>, !torch.vtensor<[4,1,1],f32>) -> !torch.vtensor<[4,1,4096],f32>
    %11842 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %11843 = "torch.prims.convert_element_type"(%11841, %11842) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %11844 = "torch.aten.mul.Tensor"(%1012, %11843) : (!torch.vtensor<[4096],bf16>, !torch.vtensor<[4,1,4096],bf16>) -> !torch.vtensor<[4,1,4096],bf16>
    %11845 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %11846 = "torch.prims.convert_element_type"(%11844, %11845) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %11847 = "torch.aten.div.Tensor"(%11846, %1014) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %11848 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %11849 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %11850 = "torch.aten.clamp"(%11847, %11848, %11849) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %11851 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %11852 = "torch.prims.convert_element_type"(%11850, %11851) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %11853 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %11854 = "torch.aten.unsqueeze"(%1016, %11853) : (!torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>
    %11855 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %11856 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %11857 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %11858 = "torch.prim.ListConstruct"(%11855, %11856, %11857) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %11859 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %11860 = "torch.aten.expand"(%11854, %11858, %11859) : (!torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ>
    %11861 = "torch_c.to_builtin_tensor"(%11852) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %11862 = "torch_c.to_builtin_tensor"(%11860) : (!torch.vtensor<[4,4096,4096],f8E4M3FNUZ>) -> tensor<4x4096x4096xf8E4M3FNUZ>
    %11863 = "util.call"(%11861, %11862) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x1x4096xf32>
    %11864 = "torch_c.from_builtin_tensor"(%11863) : (tensor<4x1x4096xf32>) -> !torch.vtensor<[4,1,4096],f32>
    %11865 = "torch.aten.div.Tensor"(%11864, %1018) : (!torch.vtensor<[4,1,4096],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],f32>
    %11866 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %11867 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %11868 = "torch.aten.clamp"(%11865, %11866, %11867) : (!torch.vtensor<[4,1,4096],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],f32>
    %11869 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %11870 = "torch.prims.convert_element_type"(%11868, %11869) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %11871 = "torch.aten.div.Tensor"(%11846, %1020) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %11872 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %11873 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %11874 = "torch.aten.clamp"(%11871, %11872, %11873) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %11875 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %11876 = "torch.prims.convert_element_type"(%11874, %11875) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %11877 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %11878 = "torch.aten.unsqueeze"(%1022, %11877) : (!torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>
    %11879 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %11880 = "torch.constant.int"() <{value = 1024 : i64}> : () -> !torch.int
    %11881 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %11882 = "torch.prim.ListConstruct"(%11879, %11880, %11881) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %11883 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %11884 = "torch.aten.expand"(%11878, %11882, %11883) : (!torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ>
    %11885 = "torch_c.to_builtin_tensor"(%11876) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %11886 = "torch_c.to_builtin_tensor"(%11884) : (!torch.vtensor<[4,1024,4096],f8E4M3FNUZ>) -> tensor<4x1024x4096xf8E4M3FNUZ>
    %11887 = "util.call"(%11885, %11886) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x1x1024xf32>
    %11888 = "torch_c.from_builtin_tensor"(%11887) : (tensor<4x1x1024xf32>) -> !torch.vtensor<[4,1,1024],f32>
    %11889 = "torch.aten.div.Tensor"(%11888, %1024) : (!torch.vtensor<[4,1,1024],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,1024],f32>
    %11890 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %11891 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %11892 = "torch.aten.clamp"(%11889, %11890, %11891) : (!torch.vtensor<[4,1,1024],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,1024],f32>
    %11893 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %11894 = "torch.prims.convert_element_type"(%11892, %11893) : (!torch.vtensor<[4,1,1024],f32>, !torch.int) -> !torch.vtensor<[4,1,1024],f8E4M3FNUZ>
    %11895 = "torch.aten.div.Tensor"(%11846, %1026) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %11896 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %11897 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %11898 = "torch.aten.clamp"(%11895, %11896, %11897) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %11899 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %11900 = "torch.prims.convert_element_type"(%11898, %11899) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %11901 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %11902 = "torch.aten.unsqueeze"(%1028, %11901) : (!torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>
    %11903 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %11904 = "torch.constant.int"() <{value = 1024 : i64}> : () -> !torch.int
    %11905 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %11906 = "torch.prim.ListConstruct"(%11903, %11904, %11905) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %11907 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %11908 = "torch.aten.expand"(%11902, %11906, %11907) : (!torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ>
    %11909 = "torch_c.to_builtin_tensor"(%11900) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %11910 = "torch_c.to_builtin_tensor"(%11908) : (!torch.vtensor<[4,1024,4096],f8E4M3FNUZ>) -> tensor<4x1024x4096xf8E4M3FNUZ>
    %11911 = "util.call"(%11909, %11910) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x1x1024xf32>
    %11912 = "torch_c.from_builtin_tensor"(%11911) : (tensor<4x1x1024xf32>) -> !torch.vtensor<[4,1,1024],f32>
    %11913 = "torch.aten.div.Tensor"(%11912, %1030) : (!torch.vtensor<[4,1,1024],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,1024],f32>
    %11914 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %11915 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %11916 = "torch.aten.clamp"(%11913, %11914, %11915) : (!torch.vtensor<[4,1,1024],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,1024],f32>
    %11917 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %11918 = "torch.prims.convert_element_type"(%11916, %11917) : (!torch.vtensor<[4,1,1024],f32>, !torch.int) -> !torch.vtensor<[4,1,1024],f8E4M3FNUZ>
    %11919 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %11920 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %11921 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %11922 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %11923 = "torch.prim.ListConstruct"(%11919, %11920, %11921, %11922) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %11924 = "torch.aten.view"(%11870, %11923) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,1,32,128],f8E4M3FNUZ>
    %11925 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %11926 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %11927 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %11928 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %11929 = "torch.prim.ListConstruct"(%11925, %11926, %11927, %11928) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %11930 = "torch.aten.view"(%11894, %11929) : (!torch.vtensor<[4,1,1024],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>
    %11931 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %11932 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %11933 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %11934 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %11935 = "torch.prim.ListConstruct"(%11931, %11932, %11933, %11934) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %11936 = "torch.aten.view"(%11918, %11935) : (!torch.vtensor<[4,1,1024],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>
    %11937 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %11938 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %11939 = "torch.aten.transpose.int"(%11924, %11937, %11938) : (!torch.vtensor<[4,1,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %11940 = "torch.aten.mul.Tensor"(%11939, %1637) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,1,1,128],bf16>) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %11941 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %11942 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %11943 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %11944 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %11945 = "torch.aten.slice.Tensor"(%11939, %11941, %11942, %11943, %11944) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,32,1,64],f8E4M3FNUZ>
    %11946 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %11947 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %11948 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %11949 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %11950 = "torch.aten.slice.Tensor"(%11939, %11946, %11947, %11948, %11949) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,32,1,64],f8E4M3FNUZ>
    %11951 = "torch.aten.neg"(%11950) : (!torch.vtensor<[4,32,1,64],f8E4M3FNUZ>) -> !torch.vtensor<[4,32,1,64],f8E4M3FNUZ>
    %11952 = "torch.prim.ListConstruct"(%11951, %11945) : (!torch.vtensor<[4,32,1,64],f8E4M3FNUZ>, !torch.vtensor<[4,32,1,64],f8E4M3FNUZ>) -> !torch.list<vtensor>
    %11953 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %11954 = "torch.aten.cat"(%11952, %11953) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %11955 = "torch.aten.mul.Tensor"(%11954, %1651) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,1,1,128],bf16>) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %11956 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %11957 = "torch.aten.add.Tensor"(%11940, %11955, %11956) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %11958 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %11959 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %11960 = "torch.aten.transpose.int"(%11957, %11958, %11959) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,1,32,128],f8E4M3FNUZ>
    %11961 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %11962 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %11963 = "torch.aten.transpose.int"(%11930, %11961, %11962) : (!torch.vtensor<[4,1,8,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %11964 = "torch.aten.mul.Tensor"(%11963, %1637) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,1,1,128],bf16>) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %11965 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %11966 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %11967 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %11968 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %11969 = "torch.aten.slice.Tensor"(%11963, %11965, %11966, %11967, %11968) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,8,1,64],f8E4M3FNUZ>
    %11970 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %11971 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %11972 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %11973 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %11974 = "torch.aten.slice.Tensor"(%11963, %11970, %11971, %11972, %11973) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,8,1,64],f8E4M3FNUZ>
    %11975 = "torch.aten.neg"(%11974) : (!torch.vtensor<[4,8,1,64],f8E4M3FNUZ>) -> !torch.vtensor<[4,8,1,64],f8E4M3FNUZ>
    %11976 = "torch.prim.ListConstruct"(%11975, %11969) : (!torch.vtensor<[4,8,1,64],f8E4M3FNUZ>, !torch.vtensor<[4,8,1,64],f8E4M3FNUZ>) -> !torch.list<vtensor>
    %11977 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %11978 = "torch.aten.cat"(%11976, %11977) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %11979 = "torch.aten.mul.Tensor"(%11978, %1651) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,1,1,128],bf16>) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %11980 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %11981 = "torch.aten.add.Tensor"(%11964, %11979, %11980) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %11982 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %11983 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %11984 = "torch.aten.transpose.int"(%11981, %11982, %11983) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>
    %11985 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %11986 = "torch.aten.floor_divide.Scalar"(%arg64, %11985) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4],si64>
    %11987 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %11988 = "torch.aten.unsqueeze"(%11986, %11987) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %11989 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %11990 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %11991 = "torch.aten.gather"(%arg65, %11989, %11988, %11990) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.vtensor<[4,1],si64>, !torch.bool) -> !torch.vtensor<[4,1],si64>
    %11992 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %11993 = "torch.aten.remainder.Scalar"(%arg64, %11992) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4],si64>
    %11994 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %11995 = "torch.aten.unsqueeze"(%11993, %11994) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %11996 = "torch.constant.none"() : () -> !torch.none
    %11997 = "torch.aten.clone"(%1031, %11996) : (!torch.vtensor<[],si64>, !torch.none) -> !torch.vtensor<[],si64>
    %11998 = "torch.aten.detach"(%11997) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %11999 = "torch.aten.detach"(%11998) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %12000 = "torch.aten.detach"(%11999) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %12001 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %12002 = "torch.aten.unsqueeze"(%12000, %12001) : (!torch.vtensor<[],si64>, !torch.int) -> !torch.vtensor<[1],si64>
    %12003 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %12004 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %12005 = "torch.prim.ListConstruct"(%12003, %12004) : (!torch.int, !torch.int) -> !torch.list<int>
    %12006 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %12007 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %12008 = "torch.prim.ListConstruct"(%12006, %12007) : (!torch.int, !torch.int) -> !torch.list<int>
    %12009 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %12010 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %12011 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %12012 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %12013 = "torch.aten.empty_strided"(%12005, %12008, %12009, %12010, %12011, %12012) : (!torch.list<int>, !torch.list<int>, !torch.int, !torch.int, !torch.Device, !torch.bool) -> !torch.vtensor<[4,1],si64>
    %12014 = "torch.constant.int"() <{value = 21 : i64}> : () -> !torch.int
    %12015 = "torch.aten.fill.Scalar"(%12013, %12014) : (!torch.vtensor<[4,1],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %12016 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %12017 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %12018 = "torch.prim.ListConstruct"(%12016, %12017) : (!torch.int, !torch.int) -> !torch.list<int>
    %12019 = "torch.aten.repeat"(%12002, %12018) : (!torch.vtensor<[1],si64>, !torch.list<int>) -> !torch.vtensor<[4,1],si64>
    %12020 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %12021 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %12022 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %12023 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %12024 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %12025 = "torch.prim.ListConstruct"(%1483, %12020, %12021, %12022, %12023, %12024) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %12026 = "torch.aten.view"(%11596, %12025) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%12026, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %12027 = "torch.prim.ListConstruct"(%11991, %12015, %12019, %11995) : (!torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>) -> !torch.list<optional<vtensor>>
    %12028 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %12029 = "torch.aten.index_put"(%12026, %12027, %11984, %12028) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>, !torch.bool) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%12029, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %12030 = "torch.constant.int"() <{value = 2097152 : i64}> : () -> !torch.int
    %12031 = "torch.prim.ListConstruct"(%1483, %12030) : (!torch.int, !torch.int) -> !torch.list<int>
    %12032 = "torch.aten.view"(%12029, %12031) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,2097152],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%12032, %1479) <{shape_expressions = #map2}> : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> ()
    %12033 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %12034 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %12035 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %12036 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %12037 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %12038 = "torch.prim.ListConstruct"(%1483, %12033, %12034, %12035, %12036, %12037) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %12039 = "torch.aten.view"(%12032, %12038) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%12039, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %12040 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %12041 = "torch.aten.floor_divide.Scalar"(%arg64, %12040) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4],si64>
    %12042 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %12043 = "torch.aten.unsqueeze"(%12041, %12042) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %12044 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %12045 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %12046 = "torch.aten.gather"(%arg65, %12044, %12043, %12045) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.vtensor<[4,1],si64>, !torch.bool) -> !torch.vtensor<[4,1],si64>
    %12047 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %12048 = "torch.aten.remainder.Scalar"(%arg64, %12047) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4],si64>
    %12049 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %12050 = "torch.aten.unsqueeze"(%12048, %12049) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %12051 = "torch.constant.none"() : () -> !torch.none
    %12052 = "torch.aten.clone"(%1032, %12051) : (!torch.vtensor<[],si64>, !torch.none) -> !torch.vtensor<[],si64>
    %12053 = "torch.aten.detach"(%12052) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %12054 = "torch.aten.detach"(%12053) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %12055 = "torch.aten.detach"(%12054) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %12056 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %12057 = "torch.aten.unsqueeze"(%12055, %12056) : (!torch.vtensor<[],si64>, !torch.int) -> !torch.vtensor<[1],si64>
    %12058 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %12059 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %12060 = "torch.prim.ListConstruct"(%12058, %12059) : (!torch.int, !torch.int) -> !torch.list<int>
    %12061 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %12062 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %12063 = "torch.prim.ListConstruct"(%12061, %12062) : (!torch.int, !torch.int) -> !torch.list<int>
    %12064 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %12065 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %12066 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %12067 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %12068 = "torch.aten.empty_strided"(%12060, %12063, %12064, %12065, %12066, %12067) : (!torch.list<int>, !torch.list<int>, !torch.int, !torch.int, !torch.Device, !torch.bool) -> !torch.vtensor<[4,1],si64>
    %12069 = "torch.constant.int"() <{value = 21 : i64}> : () -> !torch.int
    %12070 = "torch.aten.fill.Scalar"(%12068, %12069) : (!torch.vtensor<[4,1],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %12071 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %12072 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %12073 = "torch.prim.ListConstruct"(%12071, %12072) : (!torch.int, !torch.int) -> !torch.list<int>
    %12074 = "torch.aten.repeat"(%12057, %12073) : (!torch.vtensor<[1],si64>, !torch.list<int>) -> !torch.vtensor<[4,1],si64>
    %12075 = "torch.prim.ListConstruct"(%12046, %12070, %12074, %12050) : (!torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>) -> !torch.list<optional<vtensor>>
    %12076 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %12077 = "torch.aten.index_put"(%12039, %12075, %11936, %12076) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>, !torch.bool) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%12077, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %12078 = "torch.constant.int"() <{value = 2097152 : i64}> : () -> !torch.int
    %12079 = "torch.prim.ListConstruct"(%1483, %12078) : (!torch.int, !torch.int) -> !torch.list<int>
    %12080 = "torch.aten.view"(%12077, %12079) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,2097152],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%12080, %1479) <{shape_expressions = #map2}> : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> ()
    %12081 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %12082 = "torch.aten.mul.Scalar"(%arg65, %12081) : (!torch.vtensor<[4,?],si64>, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%12082, %1478) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %12083 = "torch.constant.int"() <{value = 42 : i64}> : () -> !torch.int
    %12084 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %12085 = "torch.aten.add.Scalar"(%12082, %12083, %12084) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%12085, %1478) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %12086 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %12087 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %12088 = "torch.aten.add.Scalar"(%12085, %12086, %12087) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%12088, %1478) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %12089 = "torch.prim.ListConstruct"(%1924) : (!torch.int) -> !torch.list<int>
    %12090 = "torch.aten.view"(%12088, %12089) : (!torch.vtensor<[4,?],si64>, !torch.list<int>) -> !torch.vtensor<[?],si64>
    "torch.bind_symbolic_shape"(%12090, %1478) <{shape_expressions = #map24}> : (!torch.vtensor<[?],si64>, !torch.int) -> ()
    %12091 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %12092 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %12093 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %12094 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %12095 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %12096 = "torch.prim.ListConstruct"(%1483, %12091, %12092, %12093, %12094, %12095) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %12097 = "torch.aten.view"(%12080, %12096) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%12097, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %12098 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %12099 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %12100 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %12101 = "torch.prim.ListConstruct"(%1914, %12098, %12099, %12100) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %12102 = "torch.aten.view"(%12097, %12101) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%12102, %1479) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %12103 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %12104 = "torch.aten.index_select"(%12102, %12103, %12090) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int, !torch.vtensor<[?],si64>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%12104, %1478) <{shape_expressions = #map23}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %12105 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %12106 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %12107 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %12108 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %12109 = "torch.prim.ListConstruct"(%12105, %1481, %12106, %12107, %12108) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %12110 = "torch.aten.view"(%12104, %12109) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%12110, %1478) <{shape_expressions = #map22}> : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %12111 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %12112 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %12113 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %12114 = "torch.prim.ListConstruct"(%12111, %1485, %12112, %12113) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %12115 = "torch.aten.view"(%12110, %12114) : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%12115, %1478) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %12116 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %12117 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %12118 = "torch.aten.add.Scalar"(%12085, %12116, %12117) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%12118, %1478) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %12119 = "torch.prim.ListConstruct"(%1924) : (!torch.int) -> !torch.list<int>
    %12120 = "torch.aten.view"(%12118, %12119) : (!torch.vtensor<[4,?],si64>, !torch.list<int>) -> !torch.vtensor<[?],si64>
    "torch.bind_symbolic_shape"(%12120, %1478) <{shape_expressions = #map24}> : (!torch.vtensor<[?],si64>, !torch.int) -> ()
    %12121 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %12122 = "torch.aten.index_select"(%12102, %12121, %12120) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int, !torch.vtensor<[?],si64>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%12122, %1478) <{shape_expressions = #map23}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %12123 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %12124 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %12125 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %12126 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %12127 = "torch.prim.ListConstruct"(%12123, %1481, %12124, %12125, %12126) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %12128 = "torch.aten.view"(%12122, %12127) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%12128, %1478) <{shape_expressions = #map22}> : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %12129 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %12130 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %12131 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %12132 = "torch.prim.ListConstruct"(%12129, %1485, %12130, %12131) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %12133 = "torch.aten.view"(%12128, %12132) : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%12133, %1478) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %12134 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %12135 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %12136 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %12137 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %12138 = "torch.aten.slice.Tensor"(%12115, %12134, %12135, %12136, %12137) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%12138, %1478) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %12139 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %12140 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %12141 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %12142 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %12143 = "torch.aten.slice.Tensor"(%12133, %12139, %12140, %12141, %12142) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%12143, %1478) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %12144 = "torch.constant.int"() <{value = -2 : i64}> : () -> !torch.int
    %12145 = "torch.aten.unsqueeze"(%12138, %12144) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%12145, %1478) <{shape_expressions = #map25}> : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.int) -> ()
    %12146 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %12147 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %12148 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %12149 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %12150 = "torch.prim.ListConstruct"(%12146, %1485, %12147, %12148, %12149) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %12151 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %12152 = "torch.aten.expand"(%12145, %12150, %12151) : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%12152, %1478) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %12153 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %12154 = "torch.aten.clone"(%12152, %12153) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%12154, %1478) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %12155 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %12156 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %12157 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %12158 = "torch.prim.ListConstruct"(%12155, %1485, %12156, %12157) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %12159 = "torch.aten._unsafe_view"(%12154, %12158) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%12159, %1478) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %12160 = "torch.constant.int"() <{value = -2 : i64}> : () -> !torch.int
    %12161 = "torch.aten.unsqueeze"(%12143, %12160) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%12161, %1478) <{shape_expressions = #map25}> : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.int) -> ()
    %12162 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %12163 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %12164 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %12165 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %12166 = "torch.prim.ListConstruct"(%12162, %1485, %12163, %12164, %12165) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %12167 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %12168 = "torch.aten.expand"(%12161, %12166, %12167) : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%12168, %1478) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %12169 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %12170 = "torch.aten.clone"(%12168, %12169) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%12170, %1478) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %12171 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %12172 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %12173 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %12174 = "torch.prim.ListConstruct"(%12171, %1485, %12172, %12173) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %12175 = "torch.aten._unsafe_view"(%12170, %12174) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%12175, %1478) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %12176 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %12177 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %12178 = "torch.aten.transpose.int"(%11960, %12176, %12177) : (!torch.vtensor<[4,1,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %12179 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %12180 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %12181 = "torch.aten.transpose.int"(%12159, %12179, %12180) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%12181, %1478) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %12182 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %12183 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %12184 = "torch.aten.transpose.int"(%12175, %12182, %12183) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%12184, %1478) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %12185 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %12186 = "torch.aten.squeeze.dim"(%1516, %12185) : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,1,1,?],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%12186, %1478) <{shape_expressions = #map7}> : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>, !torch.int) -> ()
    %12187 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %12188 = "torch.aten.squeeze.dim"(%12186, %12187) : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,1,1,?],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%12188, %1478) <{shape_expressions = #map7}> : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>, !torch.int) -> ()
    %12189 = "torch_c.to_builtin_tensor"(%12178) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>) -> tensor<4x32x1x128xf8E4M3FNUZ>
    %12190 = "tensor.cast"(%12189) : (tensor<4x32x1x128xf8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %12191 = "torch_c.to_builtin_tensor"(%12181) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %12192 = "torch_c.to_builtin_tensor"(%12184) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %12193 = "torch_c.to_builtin_tensor"(%12188) : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>) -> tensor<4x1x1x?xf8E4M3FNUZ>
    %12194 = "tensor.cast"(%12193) : (tensor<4x1x1x?xf8E4M3FNUZ>) -> tensor<?x?x?x?xf8E4M3FNUZ>
    %12195 = "torch_c.to_builtin_tensor"(%1034) : (!torch.vtensor<[],f32>) -> tensor<f32>
    %12196 = "util.call"(%12190, %12191, %12192, %12195, %12194) <{callee = @sharktank_masked_flash_attention_4_32_128_128_f8E4M3FNUZ_f32_f32}> : (tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<f32>, tensor<?x?x?x?xf8E4M3FNUZ>) -> tensor<4x32x?x128xf32>
    %12197 = "tensor.cast"(%12196) : (tensor<4x32x?x128xf32>) -> tensor<4x32x1x128xf32>
    %12198 = "torch_c.from_builtin_tensor"(%12197) : (tensor<4x32x1x128xf32>) -> !torch.vtensor<[4,32,1,128],f32>
    %12199 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %12200 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %12201 = "torch.aten.transpose.int"(%12198, %12199, %12200) : (!torch.vtensor<[4,32,1,128],f32>, !torch.int, !torch.int) -> !torch.vtensor<[4,1,32,128],f32>
    %12202 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %12203 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %12204 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %12205 = "torch.prim.ListConstruct"(%12202, %12203, %12204) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %12206 = "torch.aten.view"(%12201, %12205) : (!torch.vtensor<[4,1,32,128],f32>, !torch.list<int>) -> !torch.vtensor<[4,1,4096],f32>
    %12207 = "torch.aten.div.Tensor"(%12206, %1036) : (!torch.vtensor<[4,1,4096],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],f32>
    %12208 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %12209 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %12210 = "torch.aten.clamp"(%12207, %12208, %12209) : (!torch.vtensor<[4,1,4096],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],f32>
    %12211 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %12212 = "torch.prims.convert_element_type"(%12210, %12211) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %12213 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %12214 = "torch.aten.unsqueeze"(%1038, %12213) : (!torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>
    %12215 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %12216 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %12217 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %12218 = "torch.prim.ListConstruct"(%12215, %12216, %12217) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %12219 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %12220 = "torch.aten.expand"(%12214, %12218, %12219) : (!torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ>
    %12221 = "torch_c.to_builtin_tensor"(%12212) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %12222 = "torch_c.to_builtin_tensor"(%12220) : (!torch.vtensor<[4,4096,4096],f8E4M3FNUZ>) -> tensor<4x4096x4096xf8E4M3FNUZ>
    %12223 = "util.call"(%12221, %12222) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x1x4096xf32>
    %12224 = "torch_c.from_builtin_tensor"(%12223) : (tensor<4x1x4096xf32>) -> !torch.vtensor<[4,1,4096],f32>
    %12225 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %12226 = "torch.prims.convert_element_type"(%12224, %12225) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %12227 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %12228 = "torch.aten.add.Tensor"(%11827, %12226, %12227) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %12229 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %12230 = "torch.prims.convert_element_type"(%12228, %12229) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f32>
    %12231 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %12232 = "torch.aten.pow.Tensor_Scalar"(%12230, %12231) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],f32>
    %12233 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %12234 = "torch.prim.ListConstruct"(%12233) : (!torch.int) -> !torch.list<int>
    %12235 = "torch.constant.bool"() <{value = true}> : () -> !torch.bool
    %12236 = "torch.constant.none"() : () -> !torch.none
    %12237 = "torch.aten.mean.dim"(%12232, %12234, %12235, %12236) : (!torch.vtensor<[4,1,4096],f32>, !torch.list<int>, !torch.bool, !torch.none) -> !torch.vtensor<[4,1,1],f32>
    %12238 = "torch.constant.float"() <{value = 1.000000e-05 : f64}> : () -> !torch.float
    %12239 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %12240 = "torch.aten.add.Scalar"(%12237, %12238, %12239) : (!torch.vtensor<[4,1,1],f32>, !torch.float, !torch.int) -> !torch.vtensor<[4,1,1],f32>
    %12241 = "torch.aten.rsqrt"(%12240) : (!torch.vtensor<[4,1,1],f32>) -> !torch.vtensor<[4,1,1],f32>
    %12242 = "torch.aten.mul.Tensor"(%12230, %12241) : (!torch.vtensor<[4,1,4096],f32>, !torch.vtensor<[4,1,1],f32>) -> !torch.vtensor<[4,1,4096],f32>
    %12243 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %12244 = "torch.prims.convert_element_type"(%12242, %12243) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %12245 = "torch.aten.mul.Tensor"(%1040, %12244) : (!torch.vtensor<[4096],bf16>, !torch.vtensor<[4,1,4096],bf16>) -> !torch.vtensor<[4,1,4096],bf16>
    %12246 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %12247 = "torch.prims.convert_element_type"(%12245, %12246) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %12248 = "torch.aten.div.Tensor"(%12247, %1042) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %12249 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %12250 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %12251 = "torch.aten.clamp"(%12248, %12249, %12250) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %12252 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %12253 = "torch.prims.convert_element_type"(%12251, %12252) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %12254 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %12255 = "torch.aten.unsqueeze"(%1044, %12254) : (!torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>
    %12256 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %12257 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %12258 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %12259 = "torch.prim.ListConstruct"(%12256, %12257, %12258) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %12260 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %12261 = "torch.aten.expand"(%12255, %12259, %12260) : (!torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ>
    %12262 = "torch_c.to_builtin_tensor"(%12253) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %12263 = "torch_c.to_builtin_tensor"(%12261) : (!torch.vtensor<[4,14336,4096],f8E4M3FNUZ>) -> tensor<4x14336x4096xf8E4M3FNUZ>
    %12264 = "util.call"(%12262, %12263) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x1x14336xf32>
    %12265 = "torch_c.from_builtin_tensor"(%12264) : (tensor<4x1x14336xf32>) -> !torch.vtensor<[4,1,14336],f32>
    %12266 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %12267 = "torch.prims.convert_element_type"(%12265, %12266) : (!torch.vtensor<[4,1,14336],f32>, !torch.int) -> !torch.vtensor<[4,1,14336],bf16>
    %12268 = "torch.aten.silu"(%12267) : (!torch.vtensor<[4,1,14336],bf16>) -> !torch.vtensor<[4,1,14336],bf16>
    %12269 = "torch.aten.div.Tensor"(%12247, %1046) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %12270 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %12271 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %12272 = "torch.aten.clamp"(%12269, %12270, %12271) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %12273 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %12274 = "torch.prims.convert_element_type"(%12272, %12273) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %12275 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %12276 = "torch.aten.unsqueeze"(%1048, %12275) : (!torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>
    %12277 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %12278 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %12279 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %12280 = "torch.prim.ListConstruct"(%12277, %12278, %12279) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %12281 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %12282 = "torch.aten.expand"(%12276, %12280, %12281) : (!torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ>
    %12283 = "torch_c.to_builtin_tensor"(%12274) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %12284 = "torch_c.to_builtin_tensor"(%12282) : (!torch.vtensor<[4,14336,4096],f8E4M3FNUZ>) -> tensor<4x14336x4096xf8E4M3FNUZ>
    %12285 = "util.call"(%12283, %12284) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x1x14336xf32>
    %12286 = "torch_c.from_builtin_tensor"(%12285) : (tensor<4x1x14336xf32>) -> !torch.vtensor<[4,1,14336],f32>
    %12287 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %12288 = "torch.prims.convert_element_type"(%12286, %12287) : (!torch.vtensor<[4,1,14336],f32>, !torch.int) -> !torch.vtensor<[4,1,14336],bf16>
    %12289 = "torch.aten.mul.Tensor"(%12268, %12288) : (!torch.vtensor<[4,1,14336],bf16>, !torch.vtensor<[4,1,14336],bf16>) -> !torch.vtensor<[4,1,14336],bf16>
    %12290 = "torch.aten.div.Tensor"(%12289, %1050) : (!torch.vtensor<[4,1,14336],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,14336],bf16>
    %12291 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %12292 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %12293 = "torch.aten.clamp"(%12290, %12291, %12292) : (!torch.vtensor<[4,1,14336],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,14336],bf16>
    %12294 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %12295 = "torch.prims.convert_element_type"(%12293, %12294) : (!torch.vtensor<[4,1,14336],bf16>, !torch.int) -> !torch.vtensor<[4,1,14336],f8E4M3FNUZ>
    %12296 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %12297 = "torch.aten.unsqueeze"(%1052, %12296) : (!torch.vtensor<[4096,14336],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,14336],f8E4M3FNUZ>
    %12298 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %12299 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %12300 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %12301 = "torch.prim.ListConstruct"(%12298, %12299, %12300) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %12302 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %12303 = "torch.aten.expand"(%12297, %12301, %12302) : (!torch.vtensor<[1,4096,14336],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,14336],f8E4M3FNUZ>
    %12304 = "torch_c.to_builtin_tensor"(%12295) : (!torch.vtensor<[4,1,14336],f8E4M3FNUZ>) -> tensor<4x1x14336xf8E4M3FNUZ>
    %12305 = "torch_c.to_builtin_tensor"(%12303) : (!torch.vtensor<[4,4096,14336],f8E4M3FNUZ>) -> tensor<4x4096x14336xf8E4M3FNUZ>
    %12306 = "util.call"(%12304, %12305) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x14336xf8E4M3FNUZ_R4x4096x14336xf8E4M3FNUZ}> : (tensor<4x1x14336xf8E4M3FNUZ>, tensor<4x4096x14336xf8E4M3FNUZ>) -> tensor<4x1x4096xf32>
    %12307 = "torch_c.from_builtin_tensor"(%12306) : (tensor<4x1x4096xf32>) -> !torch.vtensor<[4,1,4096],f32>
    %12308 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %12309 = "torch.prims.convert_element_type"(%12307, %12308) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %12310 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %12311 = "torch.aten.add.Tensor"(%12228, %12309, %12310) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %12312 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %12313 = "torch.prims.convert_element_type"(%12311, %12312) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f32>
    %12314 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %12315 = "torch.aten.pow.Tensor_Scalar"(%12313, %12314) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],f32>
    %12316 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %12317 = "torch.prim.ListConstruct"(%12316) : (!torch.int) -> !torch.list<int>
    %12318 = "torch.constant.bool"() <{value = true}> : () -> !torch.bool
    %12319 = "torch.constant.none"() : () -> !torch.none
    %12320 = "torch.aten.mean.dim"(%12315, %12317, %12318, %12319) : (!torch.vtensor<[4,1,4096],f32>, !torch.list<int>, !torch.bool, !torch.none) -> !torch.vtensor<[4,1,1],f32>
    %12321 = "torch.constant.float"() <{value = 1.000000e-05 : f64}> : () -> !torch.float
    %12322 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %12323 = "torch.aten.add.Scalar"(%12320, %12321, %12322) : (!torch.vtensor<[4,1,1],f32>, !torch.float, !torch.int) -> !torch.vtensor<[4,1,1],f32>
    %12324 = "torch.aten.rsqrt"(%12323) : (!torch.vtensor<[4,1,1],f32>) -> !torch.vtensor<[4,1,1],f32>
    %12325 = "torch.aten.mul.Tensor"(%12313, %12324) : (!torch.vtensor<[4,1,4096],f32>, !torch.vtensor<[4,1,1],f32>) -> !torch.vtensor<[4,1,4096],f32>
    %12326 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %12327 = "torch.prims.convert_element_type"(%12325, %12326) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %12328 = "torch.aten.mul.Tensor"(%1054, %12327) : (!torch.vtensor<[4096],bf16>, !torch.vtensor<[4,1,4096],bf16>) -> !torch.vtensor<[4,1,4096],bf16>
    %12329 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %12330 = "torch.prims.convert_element_type"(%12328, %12329) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %12331 = "torch.aten.div.Tensor"(%12330, %1056) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %12332 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %12333 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %12334 = "torch.aten.clamp"(%12331, %12332, %12333) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %12335 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %12336 = "torch.prims.convert_element_type"(%12334, %12335) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %12337 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %12338 = "torch.aten.unsqueeze"(%1058, %12337) : (!torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>
    %12339 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %12340 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %12341 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %12342 = "torch.prim.ListConstruct"(%12339, %12340, %12341) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %12343 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %12344 = "torch.aten.expand"(%12338, %12342, %12343) : (!torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ>
    %12345 = "torch_c.to_builtin_tensor"(%12336) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %12346 = "torch_c.to_builtin_tensor"(%12344) : (!torch.vtensor<[4,4096,4096],f8E4M3FNUZ>) -> tensor<4x4096x4096xf8E4M3FNUZ>
    %12347 = "util.call"(%12345, %12346) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x1x4096xf32>
    %12348 = "torch_c.from_builtin_tensor"(%12347) : (tensor<4x1x4096xf32>) -> !torch.vtensor<[4,1,4096],f32>
    %12349 = "torch.aten.div.Tensor"(%12348, %1060) : (!torch.vtensor<[4,1,4096],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],f32>
    %12350 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %12351 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %12352 = "torch.aten.clamp"(%12349, %12350, %12351) : (!torch.vtensor<[4,1,4096],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],f32>
    %12353 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %12354 = "torch.prims.convert_element_type"(%12352, %12353) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
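    // The f32 matmul result is rescaled by the output rscale, clamped, and requantized to
    // f8E4M3FNUZ. The same quantize / batch-matmul / requantize pattern now repeats for the
    // K and V projections, whose [1024,4096] weights correspond to 8 KV heads x 128.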
    %12355 = "torch.aten.div.Tensor"(%12330, %1062) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %12356 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %12357 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %12358 = "torch.aten.clamp"(%12355, %12356, %12357) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %12359 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %12360 = "torch.prims.convert_element_type"(%12358, %12359) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %12361 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %12362 = "torch.aten.unsqueeze"(%1064, %12361) : (!torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>
    %12363 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %12364 = "torch.constant.int"() <{value = 1024 : i64}> : () -> !torch.int
    %12365 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %12366 = "torch.prim.ListConstruct"(%12363, %12364, %12365) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %12367 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %12368 = "torch.aten.expand"(%12362, %12366, %12367) : (!torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ>
    %12369 = "torch_c.to_builtin_tensor"(%12360) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %12370 = "torch_c.to_builtin_tensor"(%12368) : (!torch.vtensor<[4,1024,4096],f8E4M3FNUZ>) -> tensor<4x1024x4096xf8E4M3FNUZ>
    %12371 = "util.call"(%12369, %12370) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x1x1024xf32>
    %12372 = "torch_c.from_builtin_tensor"(%12371) : (tensor<4x1x1024xf32>) -> !torch.vtensor<[4,1,1024],f32>
    %12373 = "torch.aten.div.Tensor"(%12372, %1066) : (!torch.vtensor<[4,1,1024],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,1024],f32>
    %12374 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %12375 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %12376 = "torch.aten.clamp"(%12373, %12374, %12375) : (!torch.vtensor<[4,1,1024],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,1024],f32>
    %12377 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %12378 = "torch.prims.convert_element_type"(%12376, %12377) : (!torch.vtensor<[4,1,1024],f32>, !torch.int) -> !torch.vtensor<[4,1,1024],f8E4M3FNUZ>
    %12379 = "torch.aten.div.Tensor"(%12330, %1068) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %12380 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %12381 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %12382 = "torch.aten.clamp"(%12379, %12380, %12381) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %12383 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %12384 = "torch.prims.convert_element_type"(%12382, %12383) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %12385 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %12386 = "torch.aten.unsqueeze"(%1070, %12385) : (!torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>
    %12387 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %12388 = "torch.constant.int"() <{value = 1024 : i64}> : () -> !torch.int
    %12389 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %12390 = "torch.prim.ListConstruct"(%12387, %12388, %12389) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %12391 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %12392 = "torch.aten.expand"(%12386, %12390, %12391) : (!torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ>
    %12393 = "torch_c.to_builtin_tensor"(%12384) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %12394 = "torch_c.to_builtin_tensor"(%12392) : (!torch.vtensor<[4,1024,4096],f8E4M3FNUZ>) -> tensor<4x1024x4096xf8E4M3FNUZ>
    %12395 = "util.call"(%12393, %12394) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x1x1024xf32>
    %12396 = "torch_c.from_builtin_tensor"(%12395) : (tensor<4x1x1024xf32>) -> !torch.vtensor<[4,1,1024],f32>
    %12397 = "torch.aten.div.Tensor"(%12396, %1072) : (!torch.vtensor<[4,1,1024],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,1024],f32>
    %12398 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %12399 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %12400 = "torch.aten.clamp"(%12397, %12398, %12399) : (!torch.vtensor<[4,1,1024],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,1024],f32>
    %12401 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %12402 = "torch.prims.convert_element_type"(%12400, %12401) : (!torch.vtensor<[4,1,1024],f32>, !torch.int) -> !torch.vtensor<[4,1,1024],f8E4M3FNUZ>
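    // Split the projections into heads: Q -> [4,1,32,128], K/V -> [4,1,8,128].
    // 32 query heads against 8 KV heads implies grouped-query attention with group size 4.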
    %12403 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %12404 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %12405 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %12406 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %12407 = "torch.prim.ListConstruct"(%12403, %12404, %12405, %12406) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %12408 = "torch.aten.view"(%12354, %12407) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,1,32,128],f8E4M3FNUZ>
    %12409 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %12410 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %12411 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %12412 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %12413 = "torch.prim.ListConstruct"(%12409, %12410, %12411, %12412) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %12414 = "torch.aten.view"(%12378, %12413) : (!torch.vtensor<[4,1,1024],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>
    %12415 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %12416 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %12417 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %12418 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %12419 = "torch.prim.ListConstruct"(%12415, %12416, %12417, %12418) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %12420 = "torch.aten.view"(%12402, %12419) : (!torch.vtensor<[4,1,1024],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>
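    // Rotary position embedding (RoPE) on Q in rotate-half form:
    // q' = q * cos + cat(-q[..., 64:], q[..., :64]) * sin, where %1637 and %1651 are
    // likely the precomputed cos and sin tables for the current positions.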
    %12421 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %12422 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %12423 = "torch.aten.transpose.int"(%12408, %12421, %12422) : (!torch.vtensor<[4,1,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %12424 = "torch.aten.mul.Tensor"(%12423, %1637) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,1,1,128],bf16>) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %12425 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %12426 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %12427 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %12428 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %12429 = "torch.aten.slice.Tensor"(%12423, %12425, %12426, %12427, %12428) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,32,1,64],f8E4M3FNUZ>
    %12430 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %12431 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %12432 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %12433 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %12434 = "torch.aten.slice.Tensor"(%12423, %12430, %12431, %12432, %12433) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,32,1,64],f8E4M3FNUZ>
    %12435 = "torch.aten.neg"(%12434) : (!torch.vtensor<[4,32,1,64],f8E4M3FNUZ>) -> !torch.vtensor<[4,32,1,64],f8E4M3FNUZ>
    %12436 = "torch.prim.ListConstruct"(%12435, %12429) : (!torch.vtensor<[4,32,1,64],f8E4M3FNUZ>, !torch.vtensor<[4,32,1,64],f8E4M3FNUZ>) -> !torch.list<vtensor>
    %12437 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %12438 = "torch.aten.cat"(%12436, %12437) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %12439 = "torch.aten.mul.Tensor"(%12438, %1651) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,1,1,128],bf16>) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %12440 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %12441 = "torch.aten.add.Tensor"(%12424, %12439, %12440) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %12442 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %12443 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %12444 = "torch.aten.transpose.int"(%12441, %12442, %12443) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,1,32,128],f8E4M3FNUZ>
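    // The same RoPE rotation applied to K, then transposed back to [4,1,8,128].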
    %12445 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %12446 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %12447 = "torch.aten.transpose.int"(%12414, %12445, %12446) : (!torch.vtensor<[4,1,8,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %12448 = "torch.aten.mul.Tensor"(%12447, %1637) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,1,1,128],bf16>) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %12449 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %12450 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %12451 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %12452 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %12453 = "torch.aten.slice.Tensor"(%12447, %12449, %12450, %12451, %12452) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,8,1,64],f8E4M3FNUZ>
    %12454 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %12455 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %12456 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %12457 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %12458 = "torch.aten.slice.Tensor"(%12447, %12454, %12455, %12456, %12457) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,8,1,64],f8E4M3FNUZ>
    %12459 = "torch.aten.neg"(%12458) : (!torch.vtensor<[4,8,1,64],f8E4M3FNUZ>) -> !torch.vtensor<[4,8,1,64],f8E4M3FNUZ>
    %12460 = "torch.prim.ListConstruct"(%12459, %12453) : (!torch.vtensor<[4,8,1,64],f8E4M3FNUZ>, !torch.vtensor<[4,8,1,64],f8E4M3FNUZ>) -> !torch.list<vtensor>
    %12461 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %12462 = "torch.aten.cat"(%12460, %12461) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %12463 = "torch.aten.mul.Tensor"(%12462, %1651) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,1,1,128],bf16>) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %12464 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %12465 = "torch.aten.add.Tensor"(%12448, %12463, %12464) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %12466 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %12467 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %12468 = "torch.aten.transpose.int"(%12465, %12466, %12467) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>
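    // Paged KV-cache write for K. Pages hold 32 tokens: position // 32 indexes the page
    // table %arg65 (gather -> physical page id), position % 32 is the in-page slot. The
    // flat [?,2097152] cache is viewed as [pages, 32, 2, 32, 8, 128]; assuming those dims
    // are [page, block, k/v, slot, kv_head, head_dim], the fill value 22 would be this
    // transformer block's index and the cloned scalar %1073 would select the K plane.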
    %12469 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %12470 = "torch.aten.floor_divide.Scalar"(%arg64, %12469) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4],si64>
    %12471 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %12472 = "torch.aten.unsqueeze"(%12470, %12471) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %12473 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %12474 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %12475 = "torch.aten.gather"(%arg65, %12473, %12472, %12474) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.vtensor<[4,1],si64>, !torch.bool) -> !torch.vtensor<[4,1],si64>
    %12476 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %12477 = "torch.aten.remainder.Scalar"(%arg64, %12476) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4],si64>
    %12478 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %12479 = "torch.aten.unsqueeze"(%12477, %12478) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %12480 = "torch.constant.none"() : () -> !torch.none
    %12481 = "torch.aten.clone"(%1073, %12480) : (!torch.vtensor<[],si64>, !torch.none) -> !torch.vtensor<[],si64>
    %12482 = "torch.aten.detach"(%12481) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %12483 = "torch.aten.detach"(%12482) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %12484 = "torch.aten.detach"(%12483) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %12485 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %12486 = "torch.aten.unsqueeze"(%12484, %12485) : (!torch.vtensor<[],si64>, !torch.int) -> !torch.vtensor<[1],si64>
    %12487 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %12488 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %12489 = "torch.prim.ListConstruct"(%12487, %12488) : (!torch.int, !torch.int) -> !torch.list<int>
    %12490 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %12491 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %12492 = "torch.prim.ListConstruct"(%12490, %12491) : (!torch.int, !torch.int) -> !torch.list<int>
    %12493 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %12494 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %12495 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %12496 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %12497 = "torch.aten.empty_strided"(%12489, %12492, %12493, %12494, %12495, %12496) : (!torch.list<int>, !torch.list<int>, !torch.int, !torch.int, !torch.Device, !torch.bool) -> !torch.vtensor<[4,1],si64>
    %12498 = "torch.constant.int"() <{value = 22 : i64}> : () -> !torch.int
    %12499 = "torch.aten.fill.Scalar"(%12497, %12498) : (!torch.vtensor<[4,1],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %12500 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %12501 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %12502 = "torch.prim.ListConstruct"(%12500, %12501) : (!torch.int, !torch.int) -> !torch.list<int>
    %12503 = "torch.aten.repeat"(%12486, %12502) : (!torch.vtensor<[1],si64>, !torch.list<int>) -> !torch.vtensor<[4,1],si64>
    %12504 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %12505 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %12506 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %12507 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %12508 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %12509 = "torch.prim.ListConstruct"(%1483, %12504, %12505, %12506, %12507, %12508) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %12510 = "torch.aten.view"(%12080, %12509) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%12510, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %12511 = "torch.prim.ListConstruct"(%12475, %12499, %12503, %12479) : (!torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>) -> !torch.list<optional<vtensor>>
    %12512 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %12513 = "torch.aten.index_put"(%12510, %12511, %12468, %12512) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>, !torch.bool) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%12513, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %12514 = "torch.constant.int"() <{value = 2097152 : i64}> : () -> !torch.int
    %12515 = "torch.prim.ListConstruct"(%1483, %12514) : (!torch.int, !torch.int) -> !torch.list<int>
    %12516 = "torch.aten.view"(%12513, %12515) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,2097152],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%12516, %1479) <{shape_expressions = #map2}> : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> ()
    %12517 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %12518 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %12519 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %12520 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %12521 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %12522 = "torch.prim.ListConstruct"(%1483, %12517, %12518, %12519, %12520, %12521) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %12523 = "torch.aten.view"(%12516, %12522) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%12523, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %12524 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %12525 = "torch.aten.floor_divide.Scalar"(%arg64, %12524) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4],si64>
    %12526 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %12527 = "torch.aten.unsqueeze"(%12525, %12526) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %12528 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %12529 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %12530 = "torch.aten.gather"(%arg65, %12528, %12527, %12529) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.vtensor<[4,1],si64>, !torch.bool) -> !torch.vtensor<[4,1],si64>
    %12531 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %12532 = "torch.aten.remainder.Scalar"(%arg64, %12531) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4],si64>
    %12533 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %12534 = "torch.aten.unsqueeze"(%12532, %12533) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %12535 = "torch.constant.none"() : () -> !torch.none
    %12536 = "torch.aten.clone"(%1074, %12535) : (!torch.vtensor<[],si64>, !torch.none) -> !torch.vtensor<[],si64>
    %12537 = "torch.aten.detach"(%12536) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %12538 = "torch.aten.detach"(%12537) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %12539 = "torch.aten.detach"(%12538) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %12540 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %12541 = "torch.aten.unsqueeze"(%12539, %12540) : (!torch.vtensor<[],si64>, !torch.int) -> !torch.vtensor<[1],si64>
    %12542 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %12543 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %12544 = "torch.prim.ListConstruct"(%12542, %12543) : (!torch.int, !torch.int) -> !torch.list<int>
    %12545 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %12546 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %12547 = "torch.prim.ListConstruct"(%12545, %12546) : (!torch.int, !torch.int) -> !torch.list<int>
    %12548 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %12549 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %12550 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %12551 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %12552 = "torch.aten.empty_strided"(%12544, %12547, %12548, %12549, %12550, %12551) : (!torch.list<int>, !torch.list<int>, !torch.int, !torch.int, !torch.Device, !torch.bool) -> !torch.vtensor<[4,1],si64>
    %12553 = "torch.constant.int"() <{value = 22 : i64}> : () -> !torch.int
    %12554 = "torch.aten.fill.Scalar"(%12552, %12553) : (!torch.vtensor<[4,1],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %12555 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %12556 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %12557 = "torch.prim.ListConstruct"(%12555, %12556) : (!torch.int, !torch.int) -> !torch.list<int>
    %12558 = "torch.aten.repeat"(%12541, %12557) : (!torch.vtensor<[1],si64>, !torch.list<int>) -> !torch.vtensor<[4,1],si64>
    %12559 = "torch.prim.ListConstruct"(%12530, %12554, %12558, %12534) : (!torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>) -> !torch.list<optional<vtensor>>
    %12560 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %12561 = "torch.aten.index_put"(%12523, %12559, %12420, %12560) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>, !torch.bool) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%12561, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %12562 = "torch.constant.int"() <{value = 2097152 : i64}> : () -> !torch.int
    %12563 = "torch.prim.ListConstruct"(%1483, %12562) : (!torch.int, !torch.int) -> !torch.list<int>
    %12564 = "torch.aten.view"(%12561, %12563) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,2097152],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%12564, %1479) <{shape_expressions = #map2}> : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> ()
    %12565 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %12566 = "torch.aten.mul.Scalar"(%arg65, %12565) : (!torch.vtensor<[4,?],si64>, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%12566, %1478) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %12567 = "torch.constant.int"() <{value = 44 : i64}> : () -> !torch.int
    %12568 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %12569 = "torch.aten.add.Scalar"(%12566, %12567, %12568) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%12569, %1478) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %12570 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %12571 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %12572 = "torch.aten.add.Scalar"(%12569, %12570, %12571) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%12572, %1478) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %12573 = "torch.prim.ListConstruct"(%1924) : (!torch.int) -> !torch.list<int>
    %12574 = "torch.aten.view"(%12572, %12573) : (!torch.vtensor<[4,?],si64>, !torch.list<int>) -> !torch.vtensor<[?],si64>
    "torch.bind_symbolic_shape"(%12574, %1478) <{shape_expressions = #map24}> : (!torch.vtensor<[?],si64>, !torch.int) -> ()
    %12575 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %12576 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %12577 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %12578 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %12579 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %12580 = "torch.prim.ListConstruct"(%1483, %12575, %12576, %12577, %12578, %12579) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %12581 = "torch.aten.view"(%12564, %12580) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%12581, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %12582 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %12583 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %12584 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %12585 = "torch.prim.ListConstruct"(%1914, %12582, %12583, %12584) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %12586 = "torch.aten.view"(%12581, %12585) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%12586, %1479) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %12587 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %12588 = "torch.aten.index_select"(%12586, %12587, %12574) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int, !torch.vtensor<[?],si64>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%12588, %1478) <{shape_expressions = #map23}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %12589 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %12590 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %12591 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %12592 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %12593 = "torch.prim.ListConstruct"(%12589, %1481, %12590, %12591, %12592) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %12594 = "torch.aten.view"(%12588, %12593) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%12594, %1478) <{shape_expressions = #map22}> : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %12595 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %12596 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %12597 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %12598 = "torch.prim.ListConstruct"(%12595, %1485, %12596, %12597) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %12599 = "torch.aten.view"(%12594, %12598) : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%12599, %1478) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %12600 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %12601 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %12602 = "torch.aten.add.Scalar"(%12569, %12600, %12601) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%12602, %1478) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %12603 = "torch.prim.ListConstruct"(%1924) : (!torch.int) -> !torch.list<int>
    %12604 = "torch.aten.view"(%12602, %12603) : (!torch.vtensor<[4,?],si64>, !torch.list<int>) -> !torch.vtensor<[?],si64>
    "torch.bind_symbolic_shape"(%12604, %1478) <{shape_expressions = #map24}> : (!torch.vtensor<[?],si64>, !torch.int) -> ()
    %12605 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %12606 = "torch.aten.index_select"(%12586, %12605, %12604) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int, !torch.vtensor<[?],si64>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%12606, %1478) <{shape_expressions = #map23}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %12607 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %12608 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %12609 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %12610 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %12611 = "torch.prim.ListConstruct"(%12607, %1481, %12608, %12609, %12610) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %12612 = "torch.aten.view"(%12606, %12611) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%12612, %1478) <{shape_expressions = #map22}> : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %12613 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %12614 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %12615 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %12616 = "torch.prim.ListConstruct"(%12613, %1485, %12614, %12615) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %12617 = "torch.aten.view"(%12612, %12616) : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%12617, %1478) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %12618 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %12619 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %12620 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %12621 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %12622 = "torch.aten.slice.Tensor"(%12599, %12618, %12619, %12620, %12621) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%12622, %1478) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %12623 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %12624 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %12625 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %12626 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %12627 = "torch.aten.slice.Tensor"(%12617, %12623, %12624, %12625, %12626) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%12627, %1478) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %12628 = "torch.constant.int"() <{value = -2 : i64}> : () -> !torch.int
    %12629 = "torch.aten.unsqueeze"(%12622, %12628) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%12629, %1478) <{shape_expressions = #map25}> : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.int) -> ()
    %12630 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %12631 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %12632 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %12633 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %12634 = "torch.prim.ListConstruct"(%12630, %1485, %12631, %12632, %12633) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %12635 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %12636 = "torch.aten.expand"(%12629, %12634, %12635) : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%12636, %1478) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %12637 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %12638 = "torch.aten.clone"(%12636, %12637) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%12638, %1478) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %12639 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %12640 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %12641 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %12642 = "torch.prim.ListConstruct"(%12639, %1485, %12640, %12641) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %12643 = "torch.aten._unsafe_view"(%12638, %12642) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%12643, %1478) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %12644 = "torch.constant.int"() <{value = -2 : i64}> : () -> !torch.int
    %12645 = "torch.aten.unsqueeze"(%12627, %12644) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%12645, %1478) <{shape_expressions = #map25}> : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.int) -> ()
    %12646 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %12647 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %12648 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %12649 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %12650 = "torch.prim.ListConstruct"(%12646, %1485, %12647, %12648, %12649) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %12651 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %12652 = "torch.aten.expand"(%12645, %12650, %12651) : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%12652, %1478) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %12653 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %12654 = "torch.aten.clone"(%12652, %12653) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%12654, %1478) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %12655 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %12656 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %12657 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %12658 = "torch.prim.ListConstruct"(%12655, %1485, %12656, %12657) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %12659 = "torch.aten._unsafe_view"(%12654, %12658) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%12659, %1478) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %12660 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %12661 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %12662 = "torch.aten.transpose.int"(%12444, %12660, %12661) : (!torch.vtensor<[4,1,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %12663 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %12664 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %12665 = "torch.aten.transpose.int"(%12643, %12663, %12664) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%12665, %1478) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %12666 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %12667 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %12668 = "torch.aten.transpose.int"(%12659, %12666, %12667) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%12668, %1478) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %12669 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %12670 = "torch.aten.squeeze.dim"(%1516, %12669) : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,1,1,?],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%12670, %1478) <{shape_expressions = #map7}> : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>, !torch.int) -> ()
    %12671 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %12672 = "torch.aten.squeeze.dim"(%12670, %12671) : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,1,1,?],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%12672, %1478) <{shape_expressions = #map7}> : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>, !torch.int) -> ()
    %12673 = "torch_c.to_builtin_tensor"(%12662) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>) -> tensor<4x32x1x128xf8E4M3FNUZ>
    %12674 = "tensor.cast"(%12673) : (tensor<4x32x1x128xf8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %12675 = "torch_c.to_builtin_tensor"(%12665) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %12676 = "torch_c.to_builtin_tensor"(%12668) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %12677 = "torch_c.to_builtin_tensor"(%12672) : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>) -> tensor<4x1x1x?xf8E4M3FNUZ>
    %12678 = "tensor.cast"(%12677) : (tensor<4x1x1x?xf8E4M3FNUZ>) -> tensor<?x?x?x?xf8E4M3FNUZ>
    %12679 = "torch_c.to_builtin_tensor"(%1076) : (!torch.vtensor<[],f32>) -> tensor<f32>
    %12680 = "util.call"(%12674, %12675, %12676, %12679, %12678) <{callee = @sharktank_masked_flash_attention_4_32_128_128_f8E4M3FNUZ_f32_f32}> : (tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<f32>, tensor<?x?x?x?xf8E4M3FNUZ>) -> tensor<4x32x?x128xf32>
    %12681 = "tensor.cast"(%12680) : (tensor<4x32x?x128xf32>) -> tensor<4x32x1x128xf32>
    %12682 = "torch_c.from_builtin_tensor"(%12681) : (tensor<4x32x1x128xf32>) -> !torch.vtensor<[4,32,1,128],f32>
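    // Attention output: restore the static [4,32,1,128] shape, transpose heads back, and
    // merge them into [4,1,4096] for the output projection below.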
    %12683 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %12684 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %12685 = "torch.aten.transpose.int"(%12682, %12683, %12684) : (!torch.vtensor<[4,32,1,128],f32>, !torch.int, !torch.int) -> !torch.vtensor<[4,1,32,128],f32>
    %12686 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %12687 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %12688 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %12689 = "torch.prim.ListConstruct"(%12686, %12687, %12688) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %12690 = "torch.aten.view"(%12685, %12689) : (!torch.vtensor<[4,1,32,128],f32>, !torch.list<int>) -> !torch.vtensor<[4,1,4096],f32>
    %12691 = "torch.aten.div.Tensor"(%12690, %1078) : (!torch.vtensor<[4,1,4096],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],f32>
    %12692 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %12693 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %12694 = "torch.aten.clamp"(%12691, %12692, %12693) : (!torch.vtensor<[4,1,4096],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],f32>
    %12695 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %12696 = "torch.prims.convert_element_type"(%12694, %12695) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %12697 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %12698 = "torch.aten.unsqueeze"(%1080, %12697) : (!torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>
    %12699 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %12700 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %12701 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %12702 = "torch.prim.ListConstruct"(%12699, %12700, %12701) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %12703 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %12704 = "torch.aten.expand"(%12698, %12702, %12703) : (!torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ>
    %12705 = "torch_c.to_builtin_tensor"(%12696) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %12706 = "torch_c.to_builtin_tensor"(%12704) : (!torch.vtensor<[4,4096,4096],f8E4M3FNUZ>) -> tensor<4x4096x4096xf8E4M3FNUZ>
    %12707 = "util.call"(%12705, %12706) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x1x4096xf32>
    %12708 = "torch_c.from_builtin_tensor"(%12707) : (tensor<4x1x4096xf32>) -> !torch.vtensor<[4,1,4096],f32>
    %12709 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %12710 = "torch.prims.convert_element_type"(%12708, %12709) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %12711 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %12712 = "torch.aten.add.Tensor"(%12311, %12710, %12711) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
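    // Residual add closes the attention sublayer; the FFN sublayer begins with the same
    // RMSNorm pattern as above (computed in f32, then cast back to bf16).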
    %12713 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %12714 = "torch.prims.convert_element_type"(%12712, %12713) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f32>
    %12715 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %12716 = "torch.aten.pow.Tensor_Scalar"(%12714, %12715) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],f32>
    %12717 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %12718 = "torch.prim.ListConstruct"(%12717) : (!torch.int) -> !torch.list<int>
    %12719 = "torch.constant.bool"() <{value = true}> : () -> !torch.bool
    %12720 = "torch.constant.none"() : () -> !torch.none
    %12721 = "torch.aten.mean.dim"(%12716, %12718, %12719, %12720) : (!torch.vtensor<[4,1,4096],f32>, !torch.list<int>, !torch.bool, !torch.none) -> !torch.vtensor<[4,1,1],f32>
    %12722 = "torch.constant.float"() <{value = 1.000000e-05 : f64}> : () -> !torch.float
    %12723 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %12724 = "torch.aten.add.Scalar"(%12721, %12722, %12723) : (!torch.vtensor<[4,1,1],f32>, !torch.float, !torch.int) -> !torch.vtensor<[4,1,1],f32>
    %12725 = "torch.aten.rsqrt"(%12724) : (!torch.vtensor<[4,1,1],f32>) -> !torch.vtensor<[4,1,1],f32>
    %12726 = "torch.aten.mul.Tensor"(%12714, %12725) : (!torch.vtensor<[4,1,4096],f32>, !torch.vtensor<[4,1,1],f32>) -> !torch.vtensor<[4,1,4096],f32>
    %12727 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %12728 = "torch.prims.convert_element_type"(%12726, %12727) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %12729 = "torch.aten.mul.Tensor"(%1082, %12728) : (!torch.vtensor<[4096],bf16>, !torch.vtensor<[4,1,4096],bf16>) -> !torch.vtensor<[4,1,4096],bf16>
    %12730 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %12731 = "torch.prims.convert_element_type"(%12729, %12730) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %12732 = "torch.aten.div.Tensor"(%12731, %1084) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %12733 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %12734 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %12735 = "torch.aten.clamp"(%12732, %12733, %12734) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %12736 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %12737 = "torch.prims.convert_element_type"(%12735, %12736) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
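    // SwiGLU feed-forward: a gate projection to 14336 followed by SiLU, an up projection
    // to 14336, an elementwise product, and a down projection back to 4096, each reusing
    // the f8 quantize / batch-matmul-transpose-B / rescale pattern from the attention path.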
    %12738 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %12739 = "torch.aten.unsqueeze"(%1086, %12738) : (!torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>
    %12740 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %12741 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %12742 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %12743 = "torch.prim.ListConstruct"(%12740, %12741, %12742) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %12744 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %12745 = "torch.aten.expand"(%12739, %12743, %12744) : (!torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ>
    %12746 = "torch_c.to_builtin_tensor"(%12737) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %12747 = "torch_c.to_builtin_tensor"(%12745) : (!torch.vtensor<[4,14336,4096],f8E4M3FNUZ>) -> tensor<4x14336x4096xf8E4M3FNUZ>
    %12748 = "util.call"(%12746, %12747) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x1x14336xf32>
    %12749 = "torch_c.from_builtin_tensor"(%12748) : (tensor<4x1x14336xf32>) -> !torch.vtensor<[4,1,14336],f32>
    %12750 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %12751 = "torch.prims.convert_element_type"(%12749, %12750) : (!torch.vtensor<[4,1,14336],f32>, !torch.int) -> !torch.vtensor<[4,1,14336],bf16>
    %12752 = "torch.aten.silu"(%12751) : (!torch.vtensor<[4,1,14336],bf16>) -> !torch.vtensor<[4,1,14336],bf16>
    %12753 = "torch.aten.div.Tensor"(%12731, %1088) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %12754 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %12755 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %12756 = "torch.aten.clamp"(%12753, %12754, %12755) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %12757 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %12758 = "torch.prims.convert_element_type"(%12756, %12757) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %12759 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %12760 = "torch.aten.unsqueeze"(%1090, %12759) : (!torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>
    %12761 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %12762 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %12763 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %12764 = "torch.prim.ListConstruct"(%12761, %12762, %12763) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %12765 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %12766 = "torch.aten.expand"(%12760, %12764, %12765) : (!torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ>
    %12767 = "torch_c.to_builtin_tensor"(%12758) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %12768 = "torch_c.to_builtin_tensor"(%12766) : (!torch.vtensor<[4,14336,4096],f8E4M3FNUZ>) -> tensor<4x14336x4096xf8E4M3FNUZ>
    %12769 = "util.call"(%12767, %12768) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x1x14336xf32>
    %12770 = "torch_c.from_builtin_tensor"(%12769) : (tensor<4x1x14336xf32>) -> !torch.vtensor<[4,1,14336],f32>
    %12771 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %12772 = "torch.prims.convert_element_type"(%12770, %12771) : (!torch.vtensor<[4,1,14336],f32>, !torch.int) -> !torch.vtensor<[4,1,14336],bf16>
    %12773 = "torch.aten.mul.Tensor"(%12752, %12772) : (!torch.vtensor<[4,1,14336],bf16>, !torch.vtensor<[4,1,14336],bf16>) -> !torch.vtensor<[4,1,14336],bf16>
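    // gate * up product; quantize it and apply the [4096,14336] down projection.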
    %12774 = "torch.aten.div.Tensor"(%12773, %1092) : (!torch.vtensor<[4,1,14336],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,14336],bf16>
    %12775 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %12776 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %12777 = "torch.aten.clamp"(%12774, %12775, %12776) : (!torch.vtensor<[4,1,14336],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,14336],bf16>
    %12778 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %12779 = "torch.prims.convert_element_type"(%12777, %12778) : (!torch.vtensor<[4,1,14336],bf16>, !torch.int) -> !torch.vtensor<[4,1,14336],f8E4M3FNUZ>
    %12780 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %12781 = "torch.aten.unsqueeze"(%1094, %12780) : (!torch.vtensor<[4096,14336],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,14336],f8E4M3FNUZ>
    %12782 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %12783 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %12784 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %12785 = "torch.prim.ListConstruct"(%12782, %12783, %12784) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %12786 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %12787 = "torch.aten.expand"(%12781, %12785, %12786) : (!torch.vtensor<[1,4096,14336],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,14336],f8E4M3FNUZ>
    %12788 = "torch_c.to_builtin_tensor"(%12779) : (!torch.vtensor<[4,1,14336],f8E4M3FNUZ>) -> tensor<4x1x14336xf8E4M3FNUZ>
    %12789 = "torch_c.to_builtin_tensor"(%12787) : (!torch.vtensor<[4,4096,14336],f8E4M3FNUZ>) -> tensor<4x4096x14336xf8E4M3FNUZ>
    %12790 = "util.call"(%12788, %12789) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x14336xf8E4M3FNUZ_R4x4096x14336xf8E4M3FNUZ}> : (tensor<4x1x14336xf8E4M3FNUZ>, tensor<4x4096x14336xf8E4M3FNUZ>) -> tensor<4x1x4096xf32>
    %12791 = "torch_c.from_builtin_tensor"(%12790) : (tensor<4x1x4096xf32>) -> !torch.vtensor<[4,1,4096],f32>
    %12792 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %12793 = "torch.prims.convert_element_type"(%12791, %12792) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %12794 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %12795 = "torch.aten.add.Tensor"(%12712, %12793, %12794) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
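    // The second residual add completes this transformer block; the ops below begin the
    // next block's attention RMSNorm, repeating the structure annotated above.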
    %12796 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %12797 = "torch.prims.convert_element_type"(%12795, %12796) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f32>
    %12798 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %12799 = "torch.aten.pow.Tensor_Scalar"(%12797, %12798) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],f32>
    %12800 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %12801 = "torch.prim.ListConstruct"(%12800) : (!torch.int) -> !torch.list<int>
    %12802 = "torch.constant.bool"() <{value = true}> : () -> !torch.bool
    %12803 = "torch.constant.none"() : () -> !torch.none
    %12804 = "torch.aten.mean.dim"(%12799, %12801, %12802, %12803) : (!torch.vtensor<[4,1,4096],f32>, !torch.list<int>, !torch.bool, !torch.none) -> !torch.vtensor<[4,1,1],f32>
    %12805 = "torch.constant.float"() <{value = 1.000000e-05 : f64}> : () -> !torch.float
    %12806 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %12807 = "torch.aten.add.Scalar"(%12804, %12805, %12806) : (!torch.vtensor<[4,1,1],f32>, !torch.float, !torch.int) -> !torch.vtensor<[4,1,1],f32>
    %12808 = "torch.aten.rsqrt"(%12807) : (!torch.vtensor<[4,1,1],f32>) -> !torch.vtensor<[4,1,1],f32>
    %12809 = "torch.aten.mul.Tensor"(%12797, %12808) : (!torch.vtensor<[4,1,4096],f32>, !torch.vtensor<[4,1,1],f32>) -> !torch.vtensor<[4,1,4096],f32>
    %12810 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %12811 = "torch.prims.convert_element_type"(%12809, %12810) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %12812 = "torch.aten.mul.Tensor"(%1096, %12811) : (!torch.vtensor<[4096],bf16>, !torch.vtensor<[4,1,4096],bf16>) -> !torch.vtensor<[4,1,4096],bf16>
    %12813 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %12814 = "torch.prims.convert_element_type"(%12812, %12813) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %12815 = "torch.aten.div.Tensor"(%12814, %1098) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %12816 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %12817 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %12818 = "torch.aten.clamp"(%12815, %12816, %12817) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %12819 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %12820 = "torch.prims.convert_element_type"(%12818, %12819) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %12821 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %12822 = "torch.aten.unsqueeze"(%1100, %12821) : (!torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>
    %12823 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %12824 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %12825 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %12826 = "torch.prim.ListConstruct"(%12823, %12824, %12825) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %12827 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %12828 = "torch.aten.expand"(%12822, %12826, %12827) : (!torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ>
    %12829 = "torch_c.to_builtin_tensor"(%12820) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %12830 = "torch_c.to_builtin_tensor"(%12828) : (!torch.vtensor<[4,4096,4096],f8E4M3FNUZ>) -> tensor<4x4096x4096xf8E4M3FNUZ>
    %12831 = "util.call"(%12829, %12830) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x1x4096xf32>
    %12832 = "torch_c.from_builtin_tensor"(%12831) : (tensor<4x1x4096xf32>) -> !torch.vtensor<[4,1,4096],f32>
    %12833 = "torch.aten.div.Tensor"(%12832, %1102) : (!torch.vtensor<[4,1,4096],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],f32>
    %12834 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %12835 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %12836 = "torch.aten.clamp"(%12833, %12834, %12835) : (!torch.vtensor<[4,1,4096],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],f32>
    %12837 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %12838 = "torch.prims.convert_element_type"(%12836, %12837) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %12839 = "torch.aten.div.Tensor"(%12814, %1104) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %12840 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %12841 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %12842 = "torch.aten.clamp"(%12839, %12840, %12841) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %12843 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %12844 = "torch.prims.convert_element_type"(%12842, %12843) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %12845 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %12846 = "torch.aten.unsqueeze"(%1106, %12845) : (!torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>
    %12847 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %12848 = "torch.constant.int"() <{value = 1024 : i64}> : () -> !torch.int
    %12849 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %12850 = "torch.prim.ListConstruct"(%12847, %12848, %12849) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %12851 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %12852 = "torch.aten.expand"(%12846, %12850, %12851) : (!torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ>
    %12853 = "torch_c.to_builtin_tensor"(%12844) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %12854 = "torch_c.to_builtin_tensor"(%12852) : (!torch.vtensor<[4,1024,4096],f8E4M3FNUZ>) -> tensor<4x1024x4096xf8E4M3FNUZ>
    %12855 = "util.call"(%12853, %12854) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x1x1024xf32>
    %12856 = "torch_c.from_builtin_tensor"(%12855) : (tensor<4x1x1024xf32>) -> !torch.vtensor<[4,1,1024],f32>
    %12857 = "torch.aten.div.Tensor"(%12856, %1108) : (!torch.vtensor<[4,1,1024],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,1024],f32>
    %12858 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %12859 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %12860 = "torch.aten.clamp"(%12857, %12858, %12859) : (!torch.vtensor<[4,1,1024],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,1024],f32>
    %12861 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %12862 = "torch.prims.convert_element_type"(%12860, %12861) : (!torch.vtensor<[4,1,1024],f32>, !torch.int) -> !torch.vtensor<[4,1,1024],f8E4M3FNUZ>
    %12863 = "torch.aten.div.Tensor"(%12814, %1110) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %12864 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %12865 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %12866 = "torch.aten.clamp"(%12863, %12864, %12865) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %12867 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %12868 = "torch.prims.convert_element_type"(%12866, %12867) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %12869 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %12870 = "torch.aten.unsqueeze"(%1112, %12869) : (!torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>
    %12871 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %12872 = "torch.constant.int"() <{value = 1024 : i64}> : () -> !torch.int
    %12873 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %12874 = "torch.prim.ListConstruct"(%12871, %12872, %12873) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %12875 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %12876 = "torch.aten.expand"(%12870, %12874, %12875) : (!torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ>
    %12877 = "torch_c.to_builtin_tensor"(%12868) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %12878 = "torch_c.to_builtin_tensor"(%12876) : (!torch.vtensor<[4,1024,4096],f8E4M3FNUZ>) -> tensor<4x1024x4096xf8E4M3FNUZ>
    %12879 = "util.call"(%12877, %12878) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x1x1024xf32>
    %12880 = "torch_c.from_builtin_tensor"(%12879) : (tensor<4x1x1024xf32>) -> !torch.vtensor<[4,1,1024],f32>
    %12881 = "torch.aten.div.Tensor"(%12880, %1114) : (!torch.vtensor<[4,1,1024],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,1024],f32>
    %12882 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %12883 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %12884 = "torch.aten.clamp"(%12881, %12882, %12883) : (!torch.vtensor<[4,1,1024],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,1024],f32>
    %12885 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %12886 = "torch.prims.convert_element_type"(%12884, %12885) : (!torch.vtensor<[4,1,1024],f32>, !torch.int) -> !torch.vtensor<[4,1,1024],f8E4M3FNUZ>
    %12887 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %12888 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %12889 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %12890 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %12891 = "torch.prim.ListConstruct"(%12887, %12888, %12889, %12890) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %12892 = "torch.aten.view"(%12838, %12891) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,1,32,128],f8E4M3FNUZ>
    %12893 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %12894 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %12895 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %12896 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %12897 = "torch.prim.ListConstruct"(%12893, %12894, %12895, %12896) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %12898 = "torch.aten.view"(%12862, %12897) : (!torch.vtensor<[4,1,1024],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>
    %12899 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %12900 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %12901 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %12902 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %12903 = "torch.prim.ListConstruct"(%12899, %12900, %12901, %12902) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %12904 = "torch.aten.view"(%12886, %12903) : (!torch.vtensor<[4,1,1024],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>
    %12905 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %12906 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %12907 = "torch.aten.transpose.int"(%12892, %12905, %12906) : (!torch.vtensor<[4,1,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %12908 = "torch.aten.mul.Tensor"(%12907, %1637) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,1,1,128],bf16>) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %12909 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %12910 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %12911 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %12912 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %12913 = "torch.aten.slice.Tensor"(%12907, %12909, %12910, %12911, %12912) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,32,1,64],f8E4M3FNUZ>
    %12914 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %12915 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %12916 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %12917 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %12918 = "torch.aten.slice.Tensor"(%12907, %12914, %12915, %12916, %12917) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,32,1,64],f8E4M3FNUZ>
    %12919 = "torch.aten.neg"(%12918) : (!torch.vtensor<[4,32,1,64],f8E4M3FNUZ>) -> !torch.vtensor<[4,32,1,64],f8E4M3FNUZ>
    %12920 = "torch.prim.ListConstruct"(%12919, %12913) : (!torch.vtensor<[4,32,1,64],f8E4M3FNUZ>, !torch.vtensor<[4,32,1,64],f8E4M3FNUZ>) -> !torch.list<vtensor>
    %12921 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %12922 = "torch.aten.cat"(%12920, %12921) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %12923 = "torch.aten.mul.Tensor"(%12922, %1651) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,1,1,128],bf16>) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %12924 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %12925 = "torch.aten.add.Tensor"(%12908, %12923, %12924) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %12926 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %12927 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %12928 = "torch.aten.transpose.int"(%12925, %12926, %12927) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,1,32,128],f8E4M3FNUZ>
    %12929 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %12930 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %12931 = "torch.aten.transpose.int"(%12898, %12929, %12930) : (!torch.vtensor<[4,1,8,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %12932 = "torch.aten.mul.Tensor"(%12931, %1637) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,1,1,128],bf16>) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %12933 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %12934 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %12935 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %12936 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %12937 = "torch.aten.slice.Tensor"(%12931, %12933, %12934, %12935, %12936) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,8,1,64],f8E4M3FNUZ>
    %12938 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %12939 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %12940 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %12941 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %12942 = "torch.aten.slice.Tensor"(%12931, %12938, %12939, %12940, %12941) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,8,1,64],f8E4M3FNUZ>
    %12943 = "torch.aten.neg"(%12942) : (!torch.vtensor<[4,8,1,64],f8E4M3FNUZ>) -> !torch.vtensor<[4,8,1,64],f8E4M3FNUZ>
    %12944 = "torch.prim.ListConstruct"(%12943, %12937) : (!torch.vtensor<[4,8,1,64],f8E4M3FNUZ>, !torch.vtensor<[4,8,1,64],f8E4M3FNUZ>) -> !torch.list<vtensor>
    %12945 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %12946 = "torch.aten.cat"(%12944, %12945) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %12947 = "torch.aten.mul.Tensor"(%12946, %1651) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,1,1,128],bf16>) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %12948 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %12949 = "torch.aten.add.Tensor"(%12932, %12947, %12948) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %12950 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %12951 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %12952 = "torch.aten.transpose.int"(%12949, %12950, %12951) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>
    %12953 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %12954 = "torch.aten.floor_divide.Scalar"(%arg64, %12953) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4],si64>
    %12955 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %12956 = "torch.aten.unsqueeze"(%12954, %12955) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %12957 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %12958 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %12959 = "torch.aten.gather"(%arg65, %12957, %12956, %12958) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.vtensor<[4,1],si64>, !torch.bool) -> !torch.vtensor<[4,1],si64>
    %12960 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %12961 = "torch.aten.remainder.Scalar"(%arg64, %12960) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4],si64>
    %12962 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %12963 = "torch.aten.unsqueeze"(%12961, %12962) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %12964 = "torch.constant.none"() : () -> !torch.none
    %12965 = "torch.aten.clone"(%1115, %12964) : (!torch.vtensor<[],si64>, !torch.none) -> !torch.vtensor<[],si64>
    %12966 = "torch.aten.detach"(%12965) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %12967 = "torch.aten.detach"(%12966) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %12968 = "torch.aten.detach"(%12967) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %12969 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %12970 = "torch.aten.unsqueeze"(%12968, %12969) : (!torch.vtensor<[],si64>, !torch.int) -> !torch.vtensor<[1],si64>
    %12971 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %12972 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %12973 = "torch.prim.ListConstruct"(%12971, %12972) : (!torch.int, !torch.int) -> !torch.list<int>
    %12974 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %12975 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %12976 = "torch.prim.ListConstruct"(%12974, %12975) : (!torch.int, !torch.int) -> !torch.list<int>
    %12977 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %12978 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %12979 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %12980 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %12981 = "torch.aten.empty_strided"(%12973, %12976, %12977, %12978, %12979, %12980) : (!torch.list<int>, !torch.list<int>, !torch.int, !torch.int, !torch.Device, !torch.bool) -> !torch.vtensor<[4,1],si64>
    %12982 = "torch.constant.int"() <{value = 23 : i64}> : () -> !torch.int
    %12983 = "torch.aten.fill.Scalar"(%12981, %12982) : (!torch.vtensor<[4,1],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %12984 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %12985 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %12986 = "torch.prim.ListConstruct"(%12984, %12985) : (!torch.int, !torch.int) -> !torch.list<int>
    %12987 = "torch.aten.repeat"(%12970, %12986) : (!torch.vtensor<[1],si64>, !torch.list<int>) -> !torch.vtensor<[4,1],si64>
    %12988 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %12989 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %12990 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %12991 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %12992 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %12993 = "torch.prim.ListConstruct"(%1483, %12988, %12989, %12990, %12991, %12992) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %12994 = "torch.aten.view"(%12564, %12993) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%12994, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %12995 = "torch.prim.ListConstruct"(%12959, %12983, %12987, %12963) : (!torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>) -> !torch.list<optional<vtensor>>
    %12996 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %12997 = "torch.aten.index_put"(%12994, %12995, %12952, %12996) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>, !torch.bool) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%12997, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %12998 = "torch.constant.int"() <{value = 2097152 : i64}> : () -> !torch.int
    %12999 = "torch.prim.ListConstruct"(%1483, %12998) : (!torch.int, !torch.int) -> !torch.list<int>
    %13000 = "torch.aten.view"(%12997, %12999) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,2097152],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%13000, %1479) <{shape_expressions = #map2}> : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> ()
    %13001 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %13002 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %13003 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %13004 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %13005 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %13006 = "torch.prim.ListConstruct"(%1483, %13001, %13002, %13003, %13004, %13005) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %13007 = "torch.aten.view"(%13000, %13006) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%13007, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %13008 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %13009 = "torch.aten.floor_divide.Scalar"(%arg64, %13008) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4],si64>
    %13010 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %13011 = "torch.aten.unsqueeze"(%13009, %13010) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %13012 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %13013 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %13014 = "torch.aten.gather"(%arg65, %13012, %13011, %13013) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.vtensor<[4,1],si64>, !torch.bool) -> !torch.vtensor<[4,1],si64>
    %13015 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %13016 = "torch.aten.remainder.Scalar"(%arg64, %13015) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4],si64>
    %13017 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %13018 = "torch.aten.unsqueeze"(%13016, %13017) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %13019 = "torch.constant.none"() : () -> !torch.none
    %13020 = "torch.aten.clone"(%1116, %13019) : (!torch.vtensor<[],si64>, !torch.none) -> !torch.vtensor<[],si64>
    %13021 = "torch.aten.detach"(%13020) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %13022 = "torch.aten.detach"(%13021) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %13023 = "torch.aten.detach"(%13022) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %13024 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %13025 = "torch.aten.unsqueeze"(%13023, %13024) : (!torch.vtensor<[],si64>, !torch.int) -> !torch.vtensor<[1],si64>
    %13026 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %13027 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %13028 = "torch.prim.ListConstruct"(%13026, %13027) : (!torch.int, !torch.int) -> !torch.list<int>
    %13029 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %13030 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %13031 = "torch.prim.ListConstruct"(%13029, %13030) : (!torch.int, !torch.int) -> !torch.list<int>
    %13032 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %13033 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %13034 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %13035 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %13036 = "torch.aten.empty_strided"(%13028, %13031, %13032, %13033, %13034, %13035) : (!torch.list<int>, !torch.list<int>, !torch.int, !torch.int, !torch.Device, !torch.bool) -> !torch.vtensor<[4,1],si64>
    %13037 = "torch.constant.int"() <{value = 23 : i64}> : () -> !torch.int
    %13038 = "torch.aten.fill.Scalar"(%13036, %13037) : (!torch.vtensor<[4,1],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %13039 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %13040 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %13041 = "torch.prim.ListConstruct"(%13039, %13040) : (!torch.int, !torch.int) -> !torch.list<int>
    %13042 = "torch.aten.repeat"(%13025, %13041) : (!torch.vtensor<[1],si64>, !torch.list<int>) -> !torch.vtensor<[4,1],si64>
    %13043 = "torch.prim.ListConstruct"(%13014, %13038, %13042, %13018) : (!torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>) -> !torch.list<optional<vtensor>>
    %13044 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %13045 = "torch.aten.index_put"(%13007, %13043, %12904, %13044) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>, !torch.bool) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%13045, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %13046 = "torch.constant.int"() <{value = 2097152 : i64}> : () -> !torch.int
    %13047 = "torch.prim.ListConstruct"(%1483, %13046) : (!torch.int, !torch.int) -> !torch.list<int>
    %13048 = "torch.aten.view"(%13045, %13047) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,2097152],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%13048, %1479) <{shape_expressions = #map2}> : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> ()
    %13049 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %13050 = "torch.aten.mul.Scalar"(%arg65, %13049) : (!torch.vtensor<[4,?],si64>, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%13050, %1478) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %13051 = "torch.constant.int"() <{value = 46 : i64}> : () -> !torch.int
    %13052 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %13053 = "torch.aten.add.Scalar"(%13050, %13051, %13052) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%13053, %1478) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %13054 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %13055 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %13056 = "torch.aten.add.Scalar"(%13053, %13054, %13055) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%13056, %1478) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %13057 = "torch.prim.ListConstruct"(%1924) : (!torch.int) -> !torch.list<int>
    %13058 = "torch.aten.view"(%13056, %13057) : (!torch.vtensor<[4,?],si64>, !torch.list<int>) -> !torch.vtensor<[?],si64>
    "torch.bind_symbolic_shape"(%13058, %1478) <{shape_expressions = #map24}> : (!torch.vtensor<[?],si64>, !torch.int) -> ()
    %13059 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %13060 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %13061 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %13062 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %13063 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %13064 = "torch.prim.ListConstruct"(%1483, %13059, %13060, %13061, %13062, %13063) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %13065 = "torch.aten.view"(%13048, %13064) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%13065, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %13066 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %13067 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %13068 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %13069 = "torch.prim.ListConstruct"(%1914, %13066, %13067, %13068) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %13070 = "torch.aten.view"(%13065, %13069) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%13070, %1479) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %13071 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %13072 = "torch.aten.index_select"(%13070, %13071, %13058) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int, !torch.vtensor<[?],si64>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%13072, %1478) <{shape_expressions = #map23}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %13073 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %13074 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %13075 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %13076 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %13077 = "torch.prim.ListConstruct"(%13073, %1481, %13074, %13075, %13076) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %13078 = "torch.aten.view"(%13072, %13077) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%13078, %1478) <{shape_expressions = #map22}> : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %13079 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %13080 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %13081 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %13082 = "torch.prim.ListConstruct"(%13079, %1485, %13080, %13081) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %13083 = "torch.aten.view"(%13078, %13082) : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%13083, %1478) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %13084 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %13085 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %13086 = "torch.aten.add.Scalar"(%13053, %13084, %13085) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%13086, %1478) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %13087 = "torch.prim.ListConstruct"(%1924) : (!torch.int) -> !torch.list<int>
    %13088 = "torch.aten.view"(%13086, %13087) : (!torch.vtensor<[4,?],si64>, !torch.list<int>) -> !torch.vtensor<[?],si64>
    "torch.bind_symbolic_shape"(%13088, %1478) <{shape_expressions = #map24}> : (!torch.vtensor<[?],si64>, !torch.int) -> ()
    %13089 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %13090 = "torch.aten.index_select"(%13070, %13089, %13088) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int, !torch.vtensor<[?],si64>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%13090, %1478) <{shape_expressions = #map23}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %13091 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %13092 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %13093 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %13094 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %13095 = "torch.prim.ListConstruct"(%13091, %1481, %13092, %13093, %13094) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %13096 = "torch.aten.view"(%13090, %13095) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%13096, %1478) <{shape_expressions = #map22}> : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %13097 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %13098 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %13099 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %13100 = "torch.prim.ListConstruct"(%13097, %1485, %13098, %13099) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %13101 = "torch.aten.view"(%13096, %13100) : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%13101, %1478) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %13102 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %13103 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %13104 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %13105 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %13106 = "torch.aten.slice.Tensor"(%13083, %13102, %13103, %13104, %13105) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%13106, %1478) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %13107 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %13108 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %13109 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %13110 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %13111 = "torch.aten.slice.Tensor"(%13101, %13107, %13108, %13109, %13110) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%13111, %1478) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %13112 = "torch.constant.int"() <{value = -2 : i64}> : () -> !torch.int
    %13113 = "torch.aten.unsqueeze"(%13106, %13112) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%13113, %1478) <{shape_expressions = #map25}> : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.int) -> ()
    %13114 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %13115 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %13116 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %13117 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %13118 = "torch.prim.ListConstruct"(%13114, %1485, %13115, %13116, %13117) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %13119 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %13120 = "torch.aten.expand"(%13113, %13118, %13119) : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%13120, %1478) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %13121 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %13122 = "torch.aten.clone"(%13120, %13121) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%13122, %1478) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %13123 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %13124 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %13125 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %13126 = "torch.prim.ListConstruct"(%13123, %1485, %13124, %13125) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %13127 = "torch.aten._unsafe_view"(%13122, %13126) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%13127, %1478) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %13128 = "torch.constant.int"() <{value = -2 : i64}> : () -> !torch.int
    %13129 = "torch.aten.unsqueeze"(%13111, %13128) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%13129, %1478) <{shape_expressions = #map25}> : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.int) -> ()
    %13130 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %13131 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %13132 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %13133 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %13134 = "torch.prim.ListConstruct"(%13130, %1485, %13131, %13132, %13133) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %13135 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %13136 = "torch.aten.expand"(%13129, %13134, %13135) : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%13136, %1478) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %13137 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %13138 = "torch.aten.clone"(%13136, %13137) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%13138, %1478) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %13139 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %13140 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %13141 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %13142 = "torch.prim.ListConstruct"(%13139, %1485, %13140, %13141) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %13143 = "torch.aten._unsafe_view"(%13138, %13142) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%13143, %1478) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %13144 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %13145 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %13146 = "torch.aten.transpose.int"(%12928, %13144, %13145) : (!torch.vtensor<[4,1,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %13147 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %13148 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %13149 = "torch.aten.transpose.int"(%13127, %13147, %13148) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%13149, %1478) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %13150 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %13151 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %13152 = "torch.aten.transpose.int"(%13143, %13150, %13151) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%13152, %1478) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %13153 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %13154 = "torch.aten.squeeze.dim"(%1516, %13153) : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,1,1,?],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%13154, %1478) <{shape_expressions = #map7}> : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>, !torch.int) -> ()
    %13155 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %13156 = "torch.aten.squeeze.dim"(%13154, %13155) : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,1,1,?],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%13156, %1478) <{shape_expressions = #map7}> : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>, !torch.int) -> ()
    %13157 = "torch_c.to_builtin_tensor"(%13146) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>) -> tensor<4x32x1x128xf8E4M3FNUZ>
    %13158 = "tensor.cast"(%13157) : (tensor<4x32x1x128xf8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %13159 = "torch_c.to_builtin_tensor"(%13149) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %13160 = "torch_c.to_builtin_tensor"(%13152) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %13161 = "torch_c.to_builtin_tensor"(%13156) : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>) -> tensor<4x1x1x?xf8E4M3FNUZ>
    %13162 = "tensor.cast"(%13161) : (tensor<4x1x1x?xf8E4M3FNUZ>) -> tensor<?x?x?x?xf8E4M3FNUZ>
    %13163 = "torch_c.to_builtin_tensor"(%1118) : (!torch.vtensor<[],f32>) -> tensor<f32>
    %13164 = "util.call"(%13158, %13159, %13160, %13163, %13162) <{callee = @sharktank_masked_flash_attention_4_32_128_128_f8E4M3FNUZ_f32_f32}> : (tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<f32>, tensor<?x?x?x?xf8E4M3FNUZ>) -> tensor<4x32x?x128xf32>
    %13165 = "tensor.cast"(%13164) : (tensor<4x32x?x128xf32>) -> tensor<4x32x1x128xf32>
    %13166 = "torch_c.from_builtin_tensor"(%13165) : (tensor<4x32x1x128xf32>) -> !torch.vtensor<[4,32,1,128],f32>
    %13167 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %13168 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %13169 = "torch.aten.transpose.int"(%13166, %13167, %13168) : (!torch.vtensor<[4,32,1,128],f32>, !torch.int, !torch.int) -> !torch.vtensor<[4,1,32,128],f32>
    %13170 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %13171 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %13172 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %13173 = "torch.prim.ListConstruct"(%13170, %13171, %13172) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %13174 = "torch.aten.view"(%13169, %13173) : (!torch.vtensor<[4,1,32,128],f32>, !torch.list<int>) -> !torch.vtensor<[4,1,4096],f32>
    %13175 = "torch.aten.div.Tensor"(%13174, %1120) : (!torch.vtensor<[4,1,4096],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],f32>
    %13176 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %13177 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %13178 = "torch.aten.clamp"(%13175, %13176, %13177) : (!torch.vtensor<[4,1,4096],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],f32>
    %13179 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %13180 = "torch.prims.convert_element_type"(%13178, %13179) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %13181 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %13182 = "torch.aten.unsqueeze"(%1122, %13181) : (!torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>
    %13183 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %13184 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %13185 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %13186 = "torch.prim.ListConstruct"(%13183, %13184, %13185) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %13187 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %13188 = "torch.aten.expand"(%13182, %13186, %13187) : (!torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ>
    %13189 = "torch_c.to_builtin_tensor"(%13180) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %13190 = "torch_c.to_builtin_tensor"(%13188) : (!torch.vtensor<[4,4096,4096],f8E4M3FNUZ>) -> tensor<4x4096x4096xf8E4M3FNUZ>
    %13191 = "util.call"(%13189, %13190) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x1x4096xf32>
    %13192 = "torch_c.from_builtin_tensor"(%13191) : (tensor<4x1x4096xf32>) -> !torch.vtensor<[4,1,4096],f32>
    %13193 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %13194 = "torch.prims.convert_element_type"(%13192, %13193) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %13195 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %13196 = "torch.aten.add.Tensor"(%12795, %13194, %13195) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %13197 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %13198 = "torch.prims.convert_element_type"(%13196, %13197) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f32>
    %13199 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %13200 = "torch.aten.pow.Tensor_Scalar"(%13198, %13199) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],f32>
    %13201 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %13202 = "torch.prim.ListConstruct"(%13201) : (!torch.int) -> !torch.list<int>
    %13203 = "torch.constant.bool"() <{value = true}> : () -> !torch.bool
    %13204 = "torch.constant.none"() : () -> !torch.none
    %13205 = "torch.aten.mean.dim"(%13200, %13202, %13203, %13204) : (!torch.vtensor<[4,1,4096],f32>, !torch.list<int>, !torch.bool, !torch.none) -> !torch.vtensor<[4,1,1],f32>
    %13206 = "torch.constant.float"() <{value = 1.000000e-05 : f64}> : () -> !torch.float
    %13207 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %13208 = "torch.aten.add.Scalar"(%13205, %13206, %13207) : (!torch.vtensor<[4,1,1],f32>, !torch.float, !torch.int) -> !torch.vtensor<[4,1,1],f32>
    %13209 = "torch.aten.rsqrt"(%13208) : (!torch.vtensor<[4,1,1],f32>) -> !torch.vtensor<[4,1,1],f32>
    %13210 = "torch.aten.mul.Tensor"(%13198, %13209) : (!torch.vtensor<[4,1,4096],f32>, !torch.vtensor<[4,1,1],f32>) -> !torch.vtensor<[4,1,4096],f32>
    %13211 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %13212 = "torch.prims.convert_element_type"(%13210, %13211) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %13213 = "torch.aten.mul.Tensor"(%1124, %13212) : (!torch.vtensor<[4096],bf16>, !torch.vtensor<[4,1,4096],bf16>) -> !torch.vtensor<[4,1,4096],bf16>
    %13214 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %13215 = "torch.prims.convert_element_type"(%13213, %13214) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %13216 = "torch.aten.div.Tensor"(%13215, %1126) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %13217 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %13218 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %13219 = "torch.aten.clamp"(%13216, %13217, %13218) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %13220 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %13221 = "torch.prims.convert_element_type"(%13219, %13220) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %13222 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %13223 = "torch.aten.unsqueeze"(%1128, %13222) : (!torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>
    %13224 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %13225 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %13226 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %13227 = "torch.prim.ListConstruct"(%13224, %13225, %13226) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %13228 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %13229 = "torch.aten.expand"(%13223, %13227, %13228) : (!torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ>
    %13230 = "torch_c.to_builtin_tensor"(%13221) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %13231 = "torch_c.to_builtin_tensor"(%13229) : (!torch.vtensor<[4,14336,4096],f8E4M3FNUZ>) -> tensor<4x14336x4096xf8E4M3FNUZ>
    %13232 = "util.call"(%13230, %13231) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x1x14336xf32>
    %13233 = "torch_c.from_builtin_tensor"(%13232) : (tensor<4x1x14336xf32>) -> !torch.vtensor<[4,1,14336],f32>
    %13234 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %13235 = "torch.prims.convert_element_type"(%13233, %13234) : (!torch.vtensor<[4,1,14336],f32>, !torch.int) -> !torch.vtensor<[4,1,14336],bf16>
    %13236 = "torch.aten.silu"(%13235) : (!torch.vtensor<[4,1,14336],bf16>) -> !torch.vtensor<[4,1,14336],bf16>
    %13237 = "torch.aten.div.Tensor"(%13215, %1130) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %13238 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %13239 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %13240 = "torch.aten.clamp"(%13237, %13238, %13239) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %13241 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %13242 = "torch.prims.convert_element_type"(%13240, %13241) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %13243 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %13244 = "torch.aten.unsqueeze"(%1132, %13243) : (!torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>
    %13245 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %13246 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %13247 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %13248 = "torch.prim.ListConstruct"(%13245, %13246, %13247) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %13249 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %13250 = "torch.aten.expand"(%13244, %13248, %13249) : (!torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ>
    %13251 = "torch_c.to_builtin_tensor"(%13242) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %13252 = "torch_c.to_builtin_tensor"(%13250) : (!torch.vtensor<[4,14336,4096],f8E4M3FNUZ>) -> tensor<4x14336x4096xf8E4M3FNUZ>
    %13253 = "util.call"(%13251, %13252) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x1x14336xf32>
    %13254 = "torch_c.from_builtin_tensor"(%13253) : (tensor<4x1x14336xf32>) -> !torch.vtensor<[4,1,14336],f32>
    %13255 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %13256 = "torch.prims.convert_element_type"(%13254, %13255) : (!torch.vtensor<[4,1,14336],f32>, !torch.int) -> !torch.vtensor<[4,1,14336],bf16>
    %13257 = "torch.aten.mul.Tensor"(%13236, %13256) : (!torch.vtensor<[4,1,14336],bf16>, !torch.vtensor<[4,1,14336],bf16>) -> !torch.vtensor<[4,1,14336],bf16>
    %13258 = "torch.aten.div.Tensor"(%13257, %1134) : (!torch.vtensor<[4,1,14336],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,14336],bf16>
    %13259 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %13260 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %13261 = "torch.aten.clamp"(%13258, %13259, %13260) : (!torch.vtensor<[4,1,14336],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,14336],bf16>
    %13262 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %13263 = "torch.prims.convert_element_type"(%13261, %13262) : (!torch.vtensor<[4,1,14336],bf16>, !torch.int) -> !torch.vtensor<[4,1,14336],f8E4M3FNUZ>
    %13264 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %13265 = "torch.aten.unsqueeze"(%1136, %13264) : (!torch.vtensor<[4096,14336],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,14336],f8E4M3FNUZ>
    %13266 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %13267 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %13268 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %13269 = "torch.prim.ListConstruct"(%13266, %13267, %13268) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %13270 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %13271 = "torch.aten.expand"(%13265, %13269, %13270) : (!torch.vtensor<[1,4096,14336],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,14336],f8E4M3FNUZ>
    %13272 = "torch_c.to_builtin_tensor"(%13263) : (!torch.vtensor<[4,1,14336],f8E4M3FNUZ>) -> tensor<4x1x14336xf8E4M3FNUZ>
    %13273 = "torch_c.to_builtin_tensor"(%13271) : (!torch.vtensor<[4,4096,14336],f8E4M3FNUZ>) -> tensor<4x4096x14336xf8E4M3FNUZ>
    %13274 = "util.call"(%13272, %13273) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x14336xf8E4M3FNUZ_R4x4096x14336xf8E4M3FNUZ}> : (tensor<4x1x14336xf8E4M3FNUZ>, tensor<4x4096x14336xf8E4M3FNUZ>) -> tensor<4x1x4096xf32>
    %13275 = "torch_c.from_builtin_tensor"(%13274) : (tensor<4x1x4096xf32>) -> !torch.vtensor<[4,1,4096],f32>
    %13276 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %13277 = "torch.prims.convert_element_type"(%13275, %13276) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %13278 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %13279 = "torch.aten.add.Tensor"(%13196, %13277, %13278) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %13280 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %13281 = "torch.prims.convert_element_type"(%13279, %13280) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f32>
    %13282 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %13283 = "torch.aten.pow.Tensor_Scalar"(%13281, %13282) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],f32>
    %13284 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %13285 = "torch.prim.ListConstruct"(%13284) : (!torch.int) -> !torch.list<int>
    %13286 = "torch.constant.bool"() <{value = true}> : () -> !torch.bool
    %13287 = "torch.constant.none"() : () -> !torch.none
    %13288 = "torch.aten.mean.dim"(%13283, %13285, %13286, %13287) : (!torch.vtensor<[4,1,4096],f32>, !torch.list<int>, !torch.bool, !torch.none) -> !torch.vtensor<[4,1,1],f32>
    %13289 = "torch.constant.float"() <{value = 1.000000e-05 : f64}> : () -> !torch.float
    %13290 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %13291 = "torch.aten.add.Scalar"(%13288, %13289, %13290) : (!torch.vtensor<[4,1,1],f32>, !torch.float, !torch.int) -> !torch.vtensor<[4,1,1],f32>
    %13292 = "torch.aten.rsqrt"(%13291) : (!torch.vtensor<[4,1,1],f32>) -> !torch.vtensor<[4,1,1],f32>
    %13293 = "torch.aten.mul.Tensor"(%13281, %13292) : (!torch.vtensor<[4,1,4096],f32>, !torch.vtensor<[4,1,1],f32>) -> !torch.vtensor<[4,1,4096],f32>
    %13294 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %13295 = "torch.prims.convert_element_type"(%13293, %13294) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %13296 = "torch.aten.mul.Tensor"(%1138, %13295) : (!torch.vtensor<[4096],bf16>, !torch.vtensor<[4,1,4096],bf16>) -> !torch.vtensor<[4,1,4096],bf16>
    %13297 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %13298 = "torch.prims.convert_element_type"(%13296, %13297) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %13299 = "torch.aten.div.Tensor"(%13298, %1140) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %13300 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %13301 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %13302 = "torch.aten.clamp"(%13299, %13300, %13301) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %13303 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %13304 = "torch.prims.convert_element_type"(%13302, %13303) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %13305 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %13306 = "torch.aten.unsqueeze"(%1142, %13305) : (!torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>
    %13307 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %13308 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %13309 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %13310 = "torch.prim.ListConstruct"(%13307, %13308, %13309) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %13311 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %13312 = "torch.aten.expand"(%13306, %13310, %13311) : (!torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ>
    %13313 = "torch_c.to_builtin_tensor"(%13304) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %13314 = "torch_c.to_builtin_tensor"(%13312) : (!torch.vtensor<[4,4096,4096],f8E4M3FNUZ>) -> tensor<4x4096x4096xf8E4M3FNUZ>
    %13315 = "util.call"(%13313, %13314) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x1x4096xf32>
    %13316 = "torch_c.from_builtin_tensor"(%13315) : (tensor<4x1x4096xf32>) -> !torch.vtensor<[4,1,4096],f32>
    %13317 = "torch.aten.div.Tensor"(%13316, %1144) : (!torch.vtensor<[4,1,4096],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],f32>
    %13318 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %13319 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %13320 = "torch.aten.clamp"(%13317, %13318, %13319) : (!torch.vtensor<[4,1,4096],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],f32>
    %13321 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %13322 = "torch.prims.convert_element_type"(%13320, %13321) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %13323 = "torch.aten.div.Tensor"(%13298, %1146) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %13324 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %13325 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %13326 = "torch.aten.clamp"(%13323, %13324, %13325) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %13327 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %13328 = "torch.prims.convert_element_type"(%13326, %13327) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %13329 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %13330 = "torch.aten.unsqueeze"(%1148, %13329) : (!torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>
    %13331 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %13332 = "torch.constant.int"() <{value = 1024 : i64}> : () -> !torch.int
    %13333 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %13334 = "torch.prim.ListConstruct"(%13331, %13332, %13333) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %13335 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %13336 = "torch.aten.expand"(%13330, %13334, %13335) : (!torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ>
    %13337 = "torch_c.to_builtin_tensor"(%13328) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %13338 = "torch_c.to_builtin_tensor"(%13336) : (!torch.vtensor<[4,1024,4096],f8E4M3FNUZ>) -> tensor<4x1024x4096xf8E4M3FNUZ>
    %13339 = "util.call"(%13337, %13338) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x1x1024xf32>
    %13340 = "torch_c.from_builtin_tensor"(%13339) : (tensor<4x1x1024xf32>) -> !torch.vtensor<[4,1,1024],f32>
    %13341 = "torch.aten.div.Tensor"(%13340, %1150) : (!torch.vtensor<[4,1,1024],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,1024],f32>
    %13342 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %13343 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %13344 = "torch.aten.clamp"(%13341, %13342, %13343) : (!torch.vtensor<[4,1,1024],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,1024],f32>
    %13345 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %13346 = "torch.prims.convert_element_type"(%13344, %13345) : (!torch.vtensor<[4,1,1024],f32>, !torch.int) -> !torch.vtensor<[4,1,1024],f8E4M3FNUZ>
    %13347 = "torch.aten.div.Tensor"(%13298, %1152) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %13348 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %13349 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %13350 = "torch.aten.clamp"(%13347, %13348, %13349) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %13351 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %13352 = "torch.prims.convert_element_type"(%13350, %13351) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %13353 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %13354 = "torch.aten.unsqueeze"(%1154, %13353) : (!torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>
    %13355 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %13356 = "torch.constant.int"() <{value = 1024 : i64}> : () -> !torch.int
    %13357 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %13358 = "torch.prim.ListConstruct"(%13355, %13356, %13357) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %13359 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %13360 = "torch.aten.expand"(%13354, %13358, %13359) : (!torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ>
    %13361 = "torch_c.to_builtin_tensor"(%13352) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %13362 = "torch_c.to_builtin_tensor"(%13360) : (!torch.vtensor<[4,1024,4096],f8E4M3FNUZ>) -> tensor<4x1024x4096xf8E4M3FNUZ>
    %13363 = "util.call"(%13361, %13362) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x1x1024xf32>
    %13364 = "torch_c.from_builtin_tensor"(%13363) : (tensor<4x1x1024xf32>) -> !torch.vtensor<[4,1,1024],f32>
    %13365 = "torch.aten.div.Tensor"(%13364, %1156) : (!torch.vtensor<[4,1,1024],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,1024],f32>
    %13366 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %13367 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %13368 = "torch.aten.clamp"(%13365, %13366, %13367) : (!torch.vtensor<[4,1,1024],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,1024],f32>
    %13369 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %13370 = "torch.prims.convert_element_type"(%13368, %13369) : (!torch.vtensor<[4,1,1024],f32>, !torch.int) -> !torch.vtensor<[4,1,1024],f8E4M3FNUZ>
    %13371 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %13372 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %13373 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %13374 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %13375 = "torch.prim.ListConstruct"(%13371, %13372, %13373, %13374) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %13376 = "torch.aten.view"(%13322, %13375) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,1,32,128],f8E4M3FNUZ>
    %13377 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %13378 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %13379 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %13380 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %13381 = "torch.prim.ListConstruct"(%13377, %13378, %13379, %13380) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %13382 = "torch.aten.view"(%13346, %13381) : (!torch.vtensor<[4,1,1024],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>
    %13383 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %13384 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %13385 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %13386 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %13387 = "torch.prim.ListConstruct"(%13383, %13384, %13385, %13386) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %13388 = "torch.aten.view"(%13370, %13387) : (!torch.vtensor<[4,1,1024],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>
    %13389 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %13390 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %13391 = "torch.aten.transpose.int"(%13376, %13389, %13390) : (!torch.vtensor<[4,1,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %13392 = "torch.aten.mul.Tensor"(%13391, %1637) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,1,1,128],bf16>) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %13393 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %13394 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %13395 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %13396 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %13397 = "torch.aten.slice.Tensor"(%13391, %13393, %13394, %13395, %13396) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,32,1,64],f8E4M3FNUZ>
    %13398 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %13399 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %13400 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %13401 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %13402 = "torch.aten.slice.Tensor"(%13391, %13398, %13399, %13400, %13401) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,32,1,64],f8E4M3FNUZ>
    %13403 = "torch.aten.neg"(%13402) : (!torch.vtensor<[4,32,1,64],f8E4M3FNUZ>) -> !torch.vtensor<[4,32,1,64],f8E4M3FNUZ>
    %13404 = "torch.prim.ListConstruct"(%13403, %13397) : (!torch.vtensor<[4,32,1,64],f8E4M3FNUZ>, !torch.vtensor<[4,32,1,64],f8E4M3FNUZ>) -> !torch.list<vtensor>
    %13405 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %13406 = "torch.aten.cat"(%13404, %13405) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %13407 = "torch.aten.mul.Tensor"(%13406, %1651) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,1,1,128],bf16>) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %13408 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %13409 = "torch.aten.add.Tensor"(%13392, %13407, %13408) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %13410 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %13411 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %13412 = "torch.aten.transpose.int"(%13409, %13410, %13411) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,1,32,128],f8E4M3FNUZ>
    %13413 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %13414 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %13415 = "torch.aten.transpose.int"(%13382, %13413, %13414) : (!torch.vtensor<[4,1,8,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %13416 = "torch.aten.mul.Tensor"(%13415, %1637) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,1,1,128],bf16>) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %13417 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %13418 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %13419 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %13420 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %13421 = "torch.aten.slice.Tensor"(%13415, %13417, %13418, %13419, %13420) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,8,1,64],f8E4M3FNUZ>
    %13422 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %13423 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %13424 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %13425 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %13426 = "torch.aten.slice.Tensor"(%13415, %13422, %13423, %13424, %13425) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,8,1,64],f8E4M3FNUZ>
    %13427 = "torch.aten.neg"(%13426) : (!torch.vtensor<[4,8,1,64],f8E4M3FNUZ>) -> !torch.vtensor<[4,8,1,64],f8E4M3FNUZ>
    %13428 = "torch.prim.ListConstruct"(%13427, %13421) : (!torch.vtensor<[4,8,1,64],f8E4M3FNUZ>, !torch.vtensor<[4,8,1,64],f8E4M3FNUZ>) -> !torch.list<vtensor>
    %13429 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %13430 = "torch.aten.cat"(%13428, %13429) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %13431 = "torch.aten.mul.Tensor"(%13430, %1651) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,1,1,128],bf16>) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %13432 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %13433 = "torch.aten.add.Tensor"(%13416, %13431, %13432) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %13434 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %13435 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %13436 = "torch.aten.transpose.int"(%13433, %13434, %13435) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>
    %13437 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %13438 = "torch.aten.floor_divide.Scalar"(%arg64, %13437) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4],si64>
    %13439 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %13440 = "torch.aten.unsqueeze"(%13438, %13439) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %13441 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %13442 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %13443 = "torch.aten.gather"(%arg65, %13441, %13440, %13442) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.vtensor<[4,1],si64>, !torch.bool) -> !torch.vtensor<[4,1],si64>
    %13444 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %13445 = "torch.aten.remainder.Scalar"(%arg64, %13444) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4],si64>
    %13446 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %13447 = "torch.aten.unsqueeze"(%13445, %13446) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %13448 = "torch.constant.none"() : () -> !torch.none
    %13449 = "torch.aten.clone"(%1157, %13448) : (!torch.vtensor<[],si64>, !torch.none) -> !torch.vtensor<[],si64>
    %13450 = "torch.aten.detach"(%13449) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %13451 = "torch.aten.detach"(%13450) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %13452 = "torch.aten.detach"(%13451) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %13453 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %13454 = "torch.aten.unsqueeze"(%13452, %13453) : (!torch.vtensor<[],si64>, !torch.int) -> !torch.vtensor<[1],si64>
    %13455 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %13456 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %13457 = "torch.prim.ListConstruct"(%13455, %13456) : (!torch.int, !torch.int) -> !torch.list<int>
    %13458 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %13459 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %13460 = "torch.prim.ListConstruct"(%13458, %13459) : (!torch.int, !torch.int) -> !torch.list<int>
    %13461 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %13462 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %13463 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %13464 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %13465 = "torch.aten.empty_strided"(%13457, %13460, %13461, %13462, %13463, %13464) : (!torch.list<int>, !torch.list<int>, !torch.int, !torch.int, !torch.Device, !torch.bool) -> !torch.vtensor<[4,1],si64>
    %13466 = "torch.constant.int"() <{value = 24 : i64}> : () -> !torch.int
    %13467 = "torch.aten.fill.Scalar"(%13465, %13466) : (!torch.vtensor<[4,1],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %13468 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %13469 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %13470 = "torch.prim.ListConstruct"(%13468, %13469) : (!torch.int, !torch.int) -> !torch.list<int>
    %13471 = "torch.aten.repeat"(%13454, %13470) : (!torch.vtensor<[1],si64>, !torch.list<int>) -> !torch.vtensor<[4,1],si64>
    %13472 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %13473 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %13474 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %13475 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %13476 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %13477 = "torch.prim.ListConstruct"(%1483, %13472, %13473, %13474, %13475, %13476) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %13478 = "torch.aten.view"(%13048, %13477) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%13478, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %13479 = "torch.prim.ListConstruct"(%13443, %13467, %13471, %13447) : (!torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>) -> !torch.list<optional<vtensor>>
    %13480 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %13481 = "torch.aten.index_put"(%13478, %13479, %13436, %13480) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>, !torch.bool) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%13481, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %13482 = "torch.constant.int"() <{value = 2097152 : i64}> : () -> !torch.int
    %13483 = "torch.prim.ListConstruct"(%1483, %13482) : (!torch.int, !torch.int) -> !torch.list<int>
    %13484 = "torch.aten.view"(%13481, %13483) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,2097152],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%13484, %1479) <{shape_expressions = #map2}> : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> ()
    %13485 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %13486 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %13487 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %13488 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %13489 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %13490 = "torch.prim.ListConstruct"(%1483, %13485, %13486, %13487, %13488, %13489) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %13491 = "torch.aten.view"(%13484, %13490) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%13491, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %13492 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %13493 = "torch.aten.floor_divide.Scalar"(%arg64, %13492) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4],si64>
    %13494 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %13495 = "torch.aten.unsqueeze"(%13493, %13494) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %13496 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %13497 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %13498 = "torch.aten.gather"(%arg65, %13496, %13495, %13497) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.vtensor<[4,1],si64>, !torch.bool) -> !torch.vtensor<[4,1],si64>
    %13499 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %13500 = "torch.aten.remainder.Scalar"(%arg64, %13499) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4],si64>
    %13501 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %13502 = "torch.aten.unsqueeze"(%13500, %13501) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %13503 = "torch.constant.none"() : () -> !torch.none
    %13504 = "torch.aten.clone"(%1158, %13503) : (!torch.vtensor<[],si64>, !torch.none) -> !torch.vtensor<[],si64>
    %13505 = "torch.aten.detach"(%13504) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %13506 = "torch.aten.detach"(%13505) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %13507 = "torch.aten.detach"(%13506) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %13508 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %13509 = "torch.aten.unsqueeze"(%13507, %13508) : (!torch.vtensor<[],si64>, !torch.int) -> !torch.vtensor<[1],si64>
    %13510 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %13511 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %13512 = "torch.prim.ListConstruct"(%13510, %13511) : (!torch.int, !torch.int) -> !torch.list<int>
    %13513 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %13514 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %13515 = "torch.prim.ListConstruct"(%13513, %13514) : (!torch.int, !torch.int) -> !torch.list<int>
    %13516 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %13517 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %13518 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %13519 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %13520 = "torch.aten.empty_strided"(%13512, %13515, %13516, %13517, %13518, %13519) : (!torch.list<int>, !torch.list<int>, !torch.int, !torch.int, !torch.Device, !torch.bool) -> !torch.vtensor<[4,1],si64>
    %13521 = "torch.constant.int"() <{value = 24 : i64}> : () -> !torch.int
    %13522 = "torch.aten.fill.Scalar"(%13520, %13521) : (!torch.vtensor<[4,1],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %13523 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %13524 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %13525 = "torch.prim.ListConstruct"(%13523, %13524) : (!torch.int, !torch.int) -> !torch.list<int>
    %13526 = "torch.aten.repeat"(%13509, %13525) : (!torch.vtensor<[1],si64>, !torch.list<int>) -> !torch.vtensor<[4,1],si64>
    %13527 = "torch.prim.ListConstruct"(%13498, %13522, %13526, %13502) : (!torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>) -> !torch.list<optional<vtensor>>
    %13528 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %13529 = "torch.aten.index_put"(%13491, %13527, %13388, %13528) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>, !torch.bool) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%13529, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %13530 = "torch.constant.int"() <{value = 2097152 : i64}> : () -> !torch.int
    %13531 = "torch.prim.ListConstruct"(%1483, %13530) : (!torch.int, !torch.int) -> !torch.list<int>
    %13532 = "torch.aten.view"(%13529, %13531) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,2097152],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%13532, %1479) <{shape_expressions = #map2}> : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> ()
    %13533 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %13534 = "torch.aten.mul.Scalar"(%arg65, %13533) : (!torch.vtensor<[4,?],si64>, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%13534, %1478) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %13535 = "torch.constant.int"() <{value = 48 : i64}> : () -> !torch.int
    %13536 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %13537 = "torch.aten.add.Scalar"(%13534, %13535, %13536) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%13537, %1478) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %13538 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %13539 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %13540 = "torch.aten.add.Scalar"(%13537, %13538, %13539) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%13540, %1478) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %13541 = "torch.prim.ListConstruct"(%1924) : (!torch.int) -> !torch.list<int>
    %13542 = "torch.aten.view"(%13540, %13541) : (!torch.vtensor<[4,?],si64>, !torch.list<int>) -> !torch.vtensor<[?],si64>
    "torch.bind_symbolic_shape"(%13542, %1478) <{shape_expressions = #map24}> : (!torch.vtensor<[?],si64>, !torch.int) -> ()
    %13543 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %13544 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %13545 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %13546 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %13547 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %13548 = "torch.prim.ListConstruct"(%1483, %13543, %13544, %13545, %13546, %13547) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %13549 = "torch.aten.view"(%13532, %13548) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%13549, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %13550 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %13551 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %13552 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %13553 = "torch.prim.ListConstruct"(%1914, %13550, %13551, %13552) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %13554 = "torch.aten.view"(%13549, %13553) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%13554, %1479) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %13555 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %13556 = "torch.aten.index_select"(%13554, %13555, %13542) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int, !torch.vtensor<[?],si64>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%13556, %1478) <{shape_expressions = #map23}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %13557 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %13558 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %13559 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %13560 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %13561 = "torch.prim.ListConstruct"(%13557, %1481, %13558, %13559, %13560) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %13562 = "torch.aten.view"(%13556, %13561) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%13562, %1478) <{shape_expressions = #map22}> : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %13563 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %13564 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %13565 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %13566 = "torch.prim.ListConstruct"(%13563, %1485, %13564, %13565) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %13567 = "torch.aten.view"(%13562, %13566) : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%13567, %1478) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %13568 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %13569 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %13570 = "torch.aten.add.Scalar"(%13537, %13568, %13569) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%13570, %1478) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %13571 = "torch.prim.ListConstruct"(%1924) : (!torch.int) -> !torch.list<int>
    %13572 = "torch.aten.view"(%13570, %13571) : (!torch.vtensor<[4,?],si64>, !torch.list<int>) -> !torch.vtensor<[?],si64>
    "torch.bind_symbolic_shape"(%13572, %1478) <{shape_expressions = #map24}> : (!torch.vtensor<[?],si64>, !torch.int) -> ()
    %13573 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %13574 = "torch.aten.index_select"(%13554, %13573, %13572) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int, !torch.vtensor<[?],si64>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%13574, %1478) <{shape_expressions = #map23}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %13575 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %13576 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %13577 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %13578 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %13579 = "torch.prim.ListConstruct"(%13575, %1481, %13576, %13577, %13578) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %13580 = "torch.aten.view"(%13574, %13579) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%13580, %1478) <{shape_expressions = #map22}> : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %13581 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %13582 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %13583 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %13584 = "torch.prim.ListConstruct"(%13581, %1485, %13582, %13583) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %13585 = "torch.aten.view"(%13580, %13584) : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%13585, %1478) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %13586 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %13587 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %13588 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %13589 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %13590 = "torch.aten.slice.Tensor"(%13567, %13586, %13587, %13588, %13589) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%13590, %1478) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %13591 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %13592 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %13593 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %13594 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %13595 = "torch.aten.slice.Tensor"(%13585, %13591, %13592, %13593, %13594) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%13595, %1478) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %13596 = "torch.constant.int"() <{value = -2 : i64}> : () -> !torch.int
    %13597 = "torch.aten.unsqueeze"(%13590, %13596) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%13597, %1478) <{shape_expressions = #map25}> : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.int) -> ()
    %13598 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %13599 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %13600 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %13601 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %13602 = "torch.prim.ListConstruct"(%13598, %1485, %13599, %13600, %13601) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %13603 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %13604 = "torch.aten.expand"(%13597, %13602, %13603) : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%13604, %1478) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %13605 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %13606 = "torch.aten.clone"(%13604, %13605) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%13606, %1478) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %13607 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %13608 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %13609 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %13610 = "torch.prim.ListConstruct"(%13607, %1485, %13608, %13609) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %13611 = "torch.aten._unsafe_view"(%13606, %13610) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%13611, %1478) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %13612 = "torch.constant.int"() <{value = -2 : i64}> : () -> !torch.int
    %13613 = "torch.aten.unsqueeze"(%13595, %13612) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%13613, %1478) <{shape_expressions = #map25}> : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.int) -> ()
    %13614 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %13615 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %13616 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %13617 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %13618 = "torch.prim.ListConstruct"(%13614, %1485, %13615, %13616, %13617) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %13619 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %13620 = "torch.aten.expand"(%13613, %13618, %13619) : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%13620, %1478) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %13621 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %13622 = "torch.aten.clone"(%13620, %13621) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%13622, %1478) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %13623 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %13624 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %13625 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %13626 = "torch.prim.ListConstruct"(%13623, %1485, %13624, %13625) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %13627 = "torch.aten._unsafe_view"(%13622, %13626) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%13627, %1478) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %13628 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %13629 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %13630 = "torch.aten.transpose.int"(%13412, %13628, %13629) : (!torch.vtensor<[4,1,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %13631 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %13632 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %13633 = "torch.aten.transpose.int"(%13611, %13631, %13632) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%13633, %1478) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %13634 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %13635 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %13636 = "torch.aten.transpose.int"(%13627, %13634, %13635) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%13636, %1478) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %13637 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %13638 = "torch.aten.squeeze.dim"(%1516, %13637) : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,1,1,?],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%13638, %1478) <{shape_expressions = #map7}> : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>, !torch.int) -> ()
    %13639 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %13640 = "torch.aten.squeeze.dim"(%13638, %13639) : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,1,1,?],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%13640, %1478) <{shape_expressions = #map7}> : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>, !torch.int) -> ()
    %13641 = "torch_c.to_builtin_tensor"(%13630) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>) -> tensor<4x32x1x128xf8E4M3FNUZ>
    %13642 = "tensor.cast"(%13641) : (tensor<4x32x1x128xf8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %13643 = "torch_c.to_builtin_tensor"(%13633) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %13644 = "torch_c.to_builtin_tensor"(%13636) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %13645 = "torch_c.to_builtin_tensor"(%13640) : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>) -> tensor<4x1x1x?xf8E4M3FNUZ>
    %13646 = "tensor.cast"(%13645) : (tensor<4x1x1x?xf8E4M3FNUZ>) -> tensor<?x?x?x?xf8E4M3FNUZ>
    %13647 = "torch_c.to_builtin_tensor"(%1160) : (!torch.vtensor<[],f32>) -> tensor<f32>
    %13648 = "util.call"(%13642, %13643, %13644, %13647, %13646) <{callee = @sharktank_masked_flash_attention_4_32_128_128_f8E4M3FNUZ_f32_f32}> : (tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<f32>, tensor<?x?x?x?xf8E4M3FNUZ>) -> tensor<4x32x?x128xf32>
    %13649 = "tensor.cast"(%13648) : (tensor<4x32x?x128xf32>) -> tensor<4x32x1x128xf32>
    %13650 = "torch_c.from_builtin_tensor"(%13649) : (tensor<4x32x1x128xf32>) -> !torch.vtensor<[4,32,1,128],f32>
    %13651 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %13652 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %13653 = "torch.aten.transpose.int"(%13650, %13651, %13652) : (!torch.vtensor<[4,32,1,128],f32>, !torch.int, !torch.int) -> !torch.vtensor<[4,1,32,128],f32>
    %13654 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %13655 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %13656 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %13657 = "torch.prim.ListConstruct"(%13654, %13655, %13656) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %13658 = "torch.aten.view"(%13653, %13657) : (!torch.vtensor<[4,1,32,128],f32>, !torch.list<int>) -> !torch.vtensor<[4,1,4096],f32>
    %13659 = "torch.aten.div.Tensor"(%13658, %1162) : (!torch.vtensor<[4,1,4096],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],f32>
    %13660 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %13661 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %13662 = "torch.aten.clamp"(%13659, %13660, %13661) : (!torch.vtensor<[4,1,4096],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],f32>
    %13663 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %13664 = "torch.prims.convert_element_type"(%13662, %13663) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %13665 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %13666 = "torch.aten.unsqueeze"(%1164, %13665) : (!torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>
    %13667 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %13668 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %13669 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %13670 = "torch.prim.ListConstruct"(%13667, %13668, %13669) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %13671 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %13672 = "torch.aten.expand"(%13666, %13670, %13671) : (!torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ>
    %13673 = "torch_c.to_builtin_tensor"(%13664) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %13674 = "torch_c.to_builtin_tensor"(%13672) : (!torch.vtensor<[4,4096,4096],f8E4M3FNUZ>) -> tensor<4x4096x4096xf8E4M3FNUZ>
    %13675 = "util.call"(%13673, %13674) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x1x4096xf32>
    %13676 = "torch_c.from_builtin_tensor"(%13675) : (tensor<4x1x4096xf32>) -> !torch.vtensor<[4,1,4096],f32>
    %13677 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %13678 = "torch.prims.convert_element_type"(%13676, %13677) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %13679 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %13680 = "torch.aten.add.Tensor"(%13279, %13678, %13679) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %13681 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %13682 = "torch.prims.convert_element_type"(%13680, %13681) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f32>
    %13683 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %13684 = "torch.aten.pow.Tensor_Scalar"(%13682, %13683) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],f32>
    %13685 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %13686 = "torch.prim.ListConstruct"(%13685) : (!torch.int) -> !torch.list<int>
    %13687 = "torch.constant.bool"() <{value = true}> : () -> !torch.bool
    %13688 = "torch.constant.none"() : () -> !torch.none
    %13689 = "torch.aten.mean.dim"(%13684, %13686, %13687, %13688) : (!torch.vtensor<[4,1,4096],f32>, !torch.list<int>, !torch.bool, !torch.none) -> !torch.vtensor<[4,1,1],f32>
    %13690 = "torch.constant.float"() <{value = 1.000000e-05 : f64}> : () -> !torch.float
    %13691 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %13692 = "torch.aten.add.Scalar"(%13689, %13690, %13691) : (!torch.vtensor<[4,1,1],f32>, !torch.float, !torch.int) -> !torch.vtensor<[4,1,1],f32>
    %13693 = "torch.aten.rsqrt"(%13692) : (!torch.vtensor<[4,1,1],f32>) -> !torch.vtensor<[4,1,1],f32>
    %13694 = "torch.aten.mul.Tensor"(%13682, %13693) : (!torch.vtensor<[4,1,4096],f32>, !torch.vtensor<[4,1,1],f32>) -> !torch.vtensor<[4,1,4096],f32>
    %13695 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %13696 = "torch.prims.convert_element_type"(%13694, %13695) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %13697 = "torch.aten.mul.Tensor"(%1166, %13696) : (!torch.vtensor<[4096],bf16>, !torch.vtensor<[4,1,4096],bf16>) -> !torch.vtensor<[4,1,4096],bf16>
    %13698 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %13699 = "torch.prims.convert_element_type"(%13697, %13698) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %13700 = "torch.aten.div.Tensor"(%13699, %1168) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %13701 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %13702 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %13703 = "torch.aten.clamp"(%13700, %13701, %13702) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %13704 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %13705 = "torch.prims.convert_element_type"(%13703, %13704) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %13706 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %13707 = "torch.aten.unsqueeze"(%1170, %13706) : (!torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>
    %13708 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %13709 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %13710 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %13711 = "torch.prim.ListConstruct"(%13708, %13709, %13710) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %13712 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %13713 = "torch.aten.expand"(%13707, %13711, %13712) : (!torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ>
    %13714 = "torch_c.to_builtin_tensor"(%13705) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %13715 = "torch_c.to_builtin_tensor"(%13713) : (!torch.vtensor<[4,14336,4096],f8E4M3FNUZ>) -> tensor<4x14336x4096xf8E4M3FNUZ>
    %13716 = "util.call"(%13714, %13715) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x1x14336xf32>
    %13717 = "torch_c.from_builtin_tensor"(%13716) : (tensor<4x1x14336xf32>) -> !torch.vtensor<[4,1,14336],f32>
    %13718 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %13719 = "torch.prims.convert_element_type"(%13717, %13718) : (!torch.vtensor<[4,1,14336],f32>, !torch.int) -> !torch.vtensor<[4,1,14336],bf16>
    %13720 = "torch.aten.silu"(%13719) : (!torch.vtensor<[4,1,14336],bf16>) -> !torch.vtensor<[4,1,14336],bf16>
    %13721 = "torch.aten.div.Tensor"(%13699, %1172) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %13722 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %13723 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %13724 = "torch.aten.clamp"(%13721, %13722, %13723) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %13725 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %13726 = "torch.prims.convert_element_type"(%13724, %13725) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %13727 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %13728 = "torch.aten.unsqueeze"(%1174, %13727) : (!torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>
    %13729 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %13730 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %13731 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %13732 = "torch.prim.ListConstruct"(%13729, %13730, %13731) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %13733 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %13734 = "torch.aten.expand"(%13728, %13732, %13733) : (!torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ>
    %13735 = "torch_c.to_builtin_tensor"(%13726) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %13736 = "torch_c.to_builtin_tensor"(%13734) : (!torch.vtensor<[4,14336,4096],f8E4M3FNUZ>) -> tensor<4x14336x4096xf8E4M3FNUZ>
    %13737 = "util.call"(%13735, %13736) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x1x14336xf32>
    %13738 = "torch_c.from_builtin_tensor"(%13737) : (tensor<4x1x14336xf32>) -> !torch.vtensor<[4,1,14336],f32>
    %13739 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %13740 = "torch.prims.convert_element_type"(%13738, %13739) : (!torch.vtensor<[4,1,14336],f32>, !torch.int) -> !torch.vtensor<[4,1,14336],bf16>
    %13741 = "torch.aten.mul.Tensor"(%13720, %13740) : (!torch.vtensor<[4,1,14336],bf16>, !torch.vtensor<[4,1,14336],bf16>) -> !torch.vtensor<[4,1,14336],bf16>
    %13742 = "torch.aten.div.Tensor"(%13741, %1176) : (!torch.vtensor<[4,1,14336],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,14336],bf16>
    %13743 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %13744 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %13745 = "torch.aten.clamp"(%13742, %13743, %13744) : (!torch.vtensor<[4,1,14336],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,14336],bf16>
    %13746 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %13747 = "torch.prims.convert_element_type"(%13745, %13746) : (!torch.vtensor<[4,1,14336],bf16>, !torch.int) -> !torch.vtensor<[4,1,14336],f8E4M3FNUZ>
    %13748 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %13749 = "torch.aten.unsqueeze"(%1178, %13748) : (!torch.vtensor<[4096,14336],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,14336],f8E4M3FNUZ>
    %13750 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %13751 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %13752 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %13753 = "torch.prim.ListConstruct"(%13750, %13751, %13752) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %13754 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %13755 = "torch.aten.expand"(%13749, %13753, %13754) : (!torch.vtensor<[1,4096,14336],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,14336],f8E4M3FNUZ>
    %13756 = "torch_c.to_builtin_tensor"(%13747) : (!torch.vtensor<[4,1,14336],f8E4M3FNUZ>) -> tensor<4x1x14336xf8E4M3FNUZ>
    %13757 = "torch_c.to_builtin_tensor"(%13755) : (!torch.vtensor<[4,4096,14336],f8E4M3FNUZ>) -> tensor<4x4096x14336xf8E4M3FNUZ>
    %13758 = "util.call"(%13756, %13757) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x14336xf8E4M3FNUZ_R4x4096x14336xf8E4M3FNUZ}> : (tensor<4x1x14336xf8E4M3FNUZ>, tensor<4x4096x14336xf8E4M3FNUZ>) -> tensor<4x1x4096xf32>
    %13759 = "torch_c.from_builtin_tensor"(%13758) : (tensor<4x1x4096xf32>) -> !torch.vtensor<[4,1,4096],f32>
    %13760 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %13761 = "torch.prims.convert_element_type"(%13759, %13760) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %13762 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %13763 = "torch.aten.add.Tensor"(%13680, %13761, %13762) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
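    // %13763 closes the FFN residual (hidden + FFN output). Below is RMSNorm
    // computed in f32, y = x * rsqrt(mean(x^2, dim=-1) + 1e-5), cast back to
    // bf16 and scaled elementwise by the norm weight (%1180).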
    %13764 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %13765 = "torch.prims.convert_element_type"(%13763, %13764) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f32>
    %13766 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %13767 = "torch.aten.pow.Tensor_Scalar"(%13765, %13766) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],f32>
    %13768 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %13769 = "torch.prim.ListConstruct"(%13768) : (!torch.int) -> !torch.list<int>
    %13770 = "torch.constant.bool"() <{value = true}> : () -> !torch.bool
    %13771 = "torch.constant.none"() : () -> !torch.none
    %13772 = "torch.aten.mean.dim"(%13767, %13769, %13770, %13771) : (!torch.vtensor<[4,1,4096],f32>, !torch.list<int>, !torch.bool, !torch.none) -> !torch.vtensor<[4,1,1],f32>
    %13773 = "torch.constant.float"() <{value = 1.000000e-05 : f64}> : () -> !torch.float
    %13774 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %13775 = "torch.aten.add.Scalar"(%13772, %13773, %13774) : (!torch.vtensor<[4,1,1],f32>, !torch.float, !torch.int) -> !torch.vtensor<[4,1,1],f32>
    %13776 = "torch.aten.rsqrt"(%13775) : (!torch.vtensor<[4,1,1],f32>) -> !torch.vtensor<[4,1,1],f32>
    %13777 = "torch.aten.mul.Tensor"(%13765, %13776) : (!torch.vtensor<[4,1,4096],f32>, !torch.vtensor<[4,1,1],f32>) -> !torch.vtensor<[4,1,4096],f32>
    %13778 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %13779 = "torch.prims.convert_element_type"(%13777, %13778) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %13780 = "torch.aten.mul.Tensor"(%1180, %13779) : (!torch.vtensor<[4096],bf16>, !torch.vtensor<[4,1,4096],bf16>) -> !torch.vtensor<[4,1,4096],bf16>
    %13781 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %13782 = "torch.prims.convert_element_type"(%13780, %13781) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
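    // Q projection of the attention block: quantize the normed activations
    // to f8 (scale %1182), matmul against the [4096, 4096] weight, then
    // requantize the f32 result with what appears to be an output rescale
    // (%1186).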
    %13783 = "torch.aten.div.Tensor"(%13782, %1182) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %13784 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %13785 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %13786 = "torch.aten.clamp"(%13783, %13784, %13785) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %13787 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %13788 = "torch.prims.convert_element_type"(%13786, %13787) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %13789 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %13790 = "torch.aten.unsqueeze"(%1184, %13789) : (!torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>
    %13791 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %13792 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %13793 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %13794 = "torch.prim.ListConstruct"(%13791, %13792, %13793) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %13795 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %13796 = "torch.aten.expand"(%13790, %13794, %13795) : (!torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ>
    %13797 = "torch_c.to_builtin_tensor"(%13788) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %13798 = "torch_c.to_builtin_tensor"(%13796) : (!torch.vtensor<[4,4096,4096],f8E4M3FNUZ>) -> tensor<4x4096x4096xf8E4M3FNUZ>
    %13799 = "util.call"(%13797, %13798) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x1x4096xf32>
    %13800 = "torch_c.from_builtin_tensor"(%13799) : (tensor<4x1x4096xf32>) -> !torch.vtensor<[4,1,4096],f32>
    %13801 = "torch.aten.div.Tensor"(%13800, %1186) : (!torch.vtensor<[4,1,4096],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],f32>
    %13802 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %13803 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %13804 = "torch.aten.clamp"(%13801, %13802, %13803) : (!torch.vtensor<[4,1,4096],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],f32>
    %13805 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %13806 = "torch.prims.convert_element_type"(%13804, %13805) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
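    // K projection, same pattern with a [1024, 4096] weight. 1024 = 8 KV
    // heads * 128 head dim versus 4096 = 32 * 128 for Q, i.e. grouped-query
    // attention.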
    %13807 = "torch.aten.div.Tensor"(%13782, %1188) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %13808 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %13809 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %13810 = "torch.aten.clamp"(%13807, %13808, %13809) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %13811 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %13812 = "torch.prims.convert_element_type"(%13810, %13811) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %13813 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %13814 = "torch.aten.unsqueeze"(%1190, %13813) : (!torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>
    %13815 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %13816 = "torch.constant.int"() <{value = 1024 : i64}> : () -> !torch.int
    %13817 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %13818 = "torch.prim.ListConstruct"(%13815, %13816, %13817) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %13819 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %13820 = "torch.aten.expand"(%13814, %13818, %13819) : (!torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ>
    %13821 = "torch_c.to_builtin_tensor"(%13812) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %13822 = "torch_c.to_builtin_tensor"(%13820) : (!torch.vtensor<[4,1024,4096],f8E4M3FNUZ>) -> tensor<4x1024x4096xf8E4M3FNUZ>
    %13823 = "util.call"(%13821, %13822) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x1x1024xf32>
    %13824 = "torch_c.from_builtin_tensor"(%13823) : (tensor<4x1x1024xf32>) -> !torch.vtensor<[4,1,1024],f32>
    %13825 = "torch.aten.div.Tensor"(%13824, %1192) : (!torch.vtensor<[4,1,1024],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,1024],f32>
    %13826 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %13827 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %13828 = "torch.aten.clamp"(%13825, %13826, %13827) : (!torch.vtensor<[4,1,1024],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,1024],f32>
    %13829 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %13830 = "torch.prims.convert_element_type"(%13828, %13829) : (!torch.vtensor<[4,1,1024],f32>, !torch.int) -> !torch.vtensor<[4,1,1024],f8E4M3FNUZ>
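    // Third projection of the block, presumably V, with the same
    // [1024, 4096] shape as K.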
    %13831 = "torch.aten.div.Tensor"(%13782, %1194) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %13832 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %13833 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %13834 = "torch.aten.clamp"(%13831, %13832, %13833) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %13835 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %13836 = "torch.prims.convert_element_type"(%13834, %13835) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %13837 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %13838 = "torch.aten.unsqueeze"(%1196, %13837) : (!torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>
    %13839 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %13840 = "torch.constant.int"() <{value = 1024 : i64}> : () -> !torch.int
    %13841 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %13842 = "torch.prim.ListConstruct"(%13839, %13840, %13841) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %13843 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %13844 = "torch.aten.expand"(%13838, %13842, %13843) : (!torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ>
    %13845 = "torch_c.to_builtin_tensor"(%13836) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %13846 = "torch_c.to_builtin_tensor"(%13844) : (!torch.vtensor<[4,1024,4096],f8E4M3FNUZ>) -> tensor<4x1024x4096xf8E4M3FNUZ>
    %13847 = "util.call"(%13845, %13846) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x1x1024xf32>
    %13848 = "torch_c.from_builtin_tensor"(%13847) : (tensor<4x1x1024xf32>) -> !torch.vtensor<[4,1,1024],f32>
    %13849 = "torch.aten.div.Tensor"(%13848, %1198) : (!torch.vtensor<[4,1,1024],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,1024],f32>
    %13850 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %13851 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %13852 = "torch.aten.clamp"(%13849, %13850, %13851) : (!torch.vtensor<[4,1,1024],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,1024],f32>
    %13853 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %13854 = "torch.prims.convert_element_type"(%13852, %13853) : (!torch.vtensor<[4,1,1024],f32>, !torch.int) -> !torch.vtensor<[4,1,1024],f8E4M3FNUZ>
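    // Split the flat projections into heads: Q becomes [4, 1, 32, 128],
    // K and V become [4, 1, 8, 128] (batch 4, a single decode token per
    // sequence).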
    %13855 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %13856 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %13857 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %13858 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %13859 = "torch.prim.ListConstruct"(%13855, %13856, %13857, %13858) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %13860 = "torch.aten.view"(%13806, %13859) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,1,32,128],f8E4M3FNUZ>
    %13861 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %13862 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %13863 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %13864 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %13865 = "torch.prim.ListConstruct"(%13861, %13862, %13863, %13864) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %13866 = "torch.aten.view"(%13830, %13865) : (!torch.vtensor<[4,1,1024],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>
    %13867 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %13868 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %13869 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %13870 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %13871 = "torch.prim.ListConstruct"(%13867, %13868, %13869, %13870) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %13872 = "torch.aten.view"(%13854, %13871) : (!torch.vtensor<[4,1,1024],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>
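    // Rotary embedding on Q in rotate-half form:
    //   q_rot = q * cos + rotate_half(q) * sin
    // where rotate_half(q) = cat(-q[..., 64:], q[..., :64], dim=-1).
    // %1637 and %1651 appear to be the per-position cos and sin tables.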
    %13873 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %13874 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %13875 = "torch.aten.transpose.int"(%13860, %13873, %13874) : (!torch.vtensor<[4,1,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %13876 = "torch.aten.mul.Tensor"(%13875, %1637) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,1,1,128],bf16>) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %13877 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %13878 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %13879 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %13880 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %13881 = "torch.aten.slice.Tensor"(%13875, %13877, %13878, %13879, %13880) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,32,1,64],f8E4M3FNUZ>
    %13882 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %13883 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %13884 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %13885 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %13886 = "torch.aten.slice.Tensor"(%13875, %13882, %13883, %13884, %13885) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,32,1,64],f8E4M3FNUZ>
    %13887 = "torch.aten.neg"(%13886) : (!torch.vtensor<[4,32,1,64],f8E4M3FNUZ>) -> !torch.vtensor<[4,32,1,64],f8E4M3FNUZ>
    %13888 = "torch.prim.ListConstruct"(%13887, %13881) : (!torch.vtensor<[4,32,1,64],f8E4M3FNUZ>, !torch.vtensor<[4,32,1,64],f8E4M3FNUZ>) -> !torch.list<vtensor>
    %13889 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %13890 = "torch.aten.cat"(%13888, %13889) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %13891 = "torch.aten.mul.Tensor"(%13890, %1651) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,1,1,128],bf16>) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %13892 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %13893 = "torch.aten.add.Tensor"(%13876, %13891, %13892) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %13894 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %13895 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %13896 = "torch.aten.transpose.int"(%13893, %13894, %13895) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,1,32,128],f8E4M3FNUZ>
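    // The same rotary rotation applied to K (8 heads instead of 32).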
    %13897 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %13898 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %13899 = "torch.aten.transpose.int"(%13866, %13897, %13898) : (!torch.vtensor<[4,1,8,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %13900 = "torch.aten.mul.Tensor"(%13899, %1637) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,1,1,128],bf16>) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %13901 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %13902 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %13903 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %13904 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %13905 = "torch.aten.slice.Tensor"(%13899, %13901, %13902, %13903, %13904) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,8,1,64],f8E4M3FNUZ>
    %13906 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %13907 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %13908 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %13909 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %13910 = "torch.aten.slice.Tensor"(%13899, %13906, %13907, %13908, %13909) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,8,1,64],f8E4M3FNUZ>
    %13911 = "torch.aten.neg"(%13910) : (!torch.vtensor<[4,8,1,64],f8E4M3FNUZ>) -> !torch.vtensor<[4,8,1,64],f8E4M3FNUZ>
    %13912 = "torch.prim.ListConstruct"(%13911, %13905) : (!torch.vtensor<[4,8,1,64],f8E4M3FNUZ>, !torch.vtensor<[4,8,1,64],f8E4M3FNUZ>) -> !torch.list<vtensor>
    %13913 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %13914 = "torch.aten.cat"(%13912, %13913) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %13915 = "torch.aten.mul.Tensor"(%13914, %1651) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,1,1,128],bf16>) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %13916 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %13917 = "torch.aten.add.Tensor"(%13900, %13915, %13916) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %13918 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %13919 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %13920 = "torch.aten.transpose.int"(%13917, %13918, %13919) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>
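    // Paged KV-cache addressing for the write, with 32 token slots per page:
    // block ordinal = position // 32 (%arg64 holds the current positions),
    // mapped to a physical page id by gathering from the page table %arg65;
    // slot within the page = position % 32. The tensor filled with 25 appears
    // to select transformer layer 25, and the repeated scalar derived from
    // %1199 presumably selects the K plane of the cache.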
    %13921 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %13922 = "torch.aten.floor_divide.Scalar"(%arg64, %13921) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4],si64>
    %13923 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %13924 = "torch.aten.unsqueeze"(%13922, %13923) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %13925 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %13926 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %13927 = "torch.aten.gather"(%arg65, %13925, %13924, %13926) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.vtensor<[4,1],si64>, !torch.bool) -> !torch.vtensor<[4,1],si64>
    %13928 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %13929 = "torch.aten.remainder.Scalar"(%arg64, %13928) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4],si64>
    %13930 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %13931 = "torch.aten.unsqueeze"(%13929, %13930) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %13932 = "torch.constant.none"() : () -> !torch.none
    %13933 = "torch.aten.clone"(%1199, %13932) : (!torch.vtensor<[],si64>, !torch.none) -> !torch.vtensor<[],si64>
    %13934 = "torch.aten.detach"(%13933) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %13935 = "torch.aten.detach"(%13934) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %13936 = "torch.aten.detach"(%13935) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %13937 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %13938 = "torch.aten.unsqueeze"(%13936, %13937) : (!torch.vtensor<[],si64>, !torch.int) -> !torch.vtensor<[1],si64>
    %13939 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %13940 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %13941 = "torch.prim.ListConstruct"(%13939, %13940) : (!torch.int, !torch.int) -> !torch.list<int>
    %13942 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %13943 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %13944 = "torch.prim.ListConstruct"(%13942, %13943) : (!torch.int, !torch.int) -> !torch.list<int>
    %13945 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %13946 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %13947 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %13948 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %13949 = "torch.aten.empty_strided"(%13941, %13944, %13945, %13946, %13947, %13948) : (!torch.list<int>, !torch.list<int>, !torch.int, !torch.int, !torch.Device, !torch.bool) -> !torch.vtensor<[4,1],si64>
    %13950 = "torch.constant.int"() <{value = 25 : i64}> : () -> !torch.int
    %13951 = "torch.aten.fill.Scalar"(%13949, %13950) : (!torch.vtensor<[4,1],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %13952 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %13953 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %13954 = "torch.prim.ListConstruct"(%13952, %13953) : (!torch.int, !torch.int) -> !torch.list<int>
    %13955 = "torch.aten.repeat"(%13938, %13954) : (!torch.vtensor<[1],si64>, !torch.list<int>) -> !torch.vtensor<[4,1],si64>
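    // Scatter the rotated K (%13920, [4, 1, 8, 128]) into the cache: the flat
    // [?, 2097152] buffer is viewed as [pages, 32, 2, 32, 8, 128] (layers,
    // K/V plane, slots, heads, head dim; 32*2*32*8*128 = 2097152), index_put
    // with (page, layer, plane, slot), and flattened back.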
    %13956 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %13957 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %13958 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %13959 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %13960 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %13961 = "torch.prim.ListConstruct"(%1483, %13956, %13957, %13958, %13959, %13960) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %13962 = "torch.aten.view"(%13532, %13961) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%13962, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %13963 = "torch.prim.ListConstruct"(%13927, %13951, %13955, %13931) : (!torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>) -> !torch.list<optional<vtensor>>
    %13964 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %13965 = "torch.aten.index_put"(%13962, %13963, %13920, %13964) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>, !torch.bool) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%13965, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %13966 = "torch.constant.int"() <{value = 2097152 : i64}> : () -> !torch.int
    %13967 = "torch.prim.ListConstruct"(%1483, %13966) : (!torch.int, !torch.int) -> !torch.list<int>
    %13968 = "torch.aten.view"(%13965, %13967) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,2097152],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%13968, %1479) <{shape_expressions = #map2}> : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> ()
    %13969 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %13970 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %13971 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %13972 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %13973 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %13974 = "torch.prim.ListConstruct"(%1483, %13969, %13970, %13971, %13972, %13973) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %13975 = "torch.aten.view"(%13968, %13974) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%13975, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %13976 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %13977 = "torch.aten.floor_divide.Scalar"(%arg64, %13976) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4],si64>
    %13978 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %13979 = "torch.aten.unsqueeze"(%13977, %13978) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %13980 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %13981 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %13982 = "torch.aten.gather"(%arg65, %13980, %13979, %13981) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.vtensor<[4,1],si64>, !torch.bool) -> !torch.vtensor<[4,1],si64>
    %13983 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %13984 = "torch.aten.remainder.Scalar"(%arg64, %13983) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4],si64>
    %13985 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %13986 = "torch.aten.unsqueeze"(%13984, %13985) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %13987 = "torch.constant.none"() : () -> !torch.none
    %13988 = "torch.aten.clone"(%1200, %13987) : (!torch.vtensor<[],si64>, !torch.none) -> !torch.vtensor<[],si64>
    %13989 = "torch.aten.detach"(%13988) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %13990 = "torch.aten.detach"(%13989) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %13991 = "torch.aten.detach"(%13990) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %13992 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %13993 = "torch.aten.unsqueeze"(%13991, %13992) : (!torch.vtensor<[],si64>, !torch.int) -> !torch.vtensor<[1],si64>
    %13994 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %13995 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %13996 = "torch.prim.ListConstruct"(%13994, %13995) : (!torch.int, !torch.int) -> !torch.list<int>
    %13997 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %13998 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %13999 = "torch.prim.ListConstruct"(%13997, %13998) : (!torch.int, !torch.int) -> !torch.list<int>
    %14000 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %14001 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %14002 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %14003 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %14004 = "torch.aten.empty_strided"(%13996, %13999, %14000, %14001, %14002, %14003) : (!torch.list<int>, !torch.list<int>, !torch.int, !torch.int, !torch.Device, !torch.bool) -> !torch.vtensor<[4,1],si64>
    %14005 = "torch.constant.int"() <{value = 25 : i64}> : () -> !torch.int
    %14006 = "torch.aten.fill.Scalar"(%14004, %14005) : (!torch.vtensor<[4,1],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %14007 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %14008 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %14009 = "torch.prim.ListConstruct"(%14007, %14008) : (!torch.int, !torch.int) -> !torch.list<int>
    %14010 = "torch.aten.repeat"(%13993, %14009) : (!torch.vtensor<[1],si64>, !torch.list<int>) -> !torch.vtensor<[4,1],si64>
    %14011 = "torch.prim.ListConstruct"(%13982, %14006, %14010, %13986) : (!torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>) -> !torch.list<optional<vtensor>>
    %14012 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %14013 = "torch.aten.index_put"(%13975, %14011, %13872, %14012) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>, !torch.bool) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%14013, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %14014 = "torch.constant.int"() <{value = 2097152 : i64}> : () -> !torch.int
    %14015 = "torch.prim.ListConstruct"(%1483, %14014) : (!torch.int, !torch.int) -> !torch.list<int>
    %14016 = "torch.aten.view"(%14013, %14015) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,2097152],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%14016, %1479) <{shape_expressions = #map2}> : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> ()
    %14017 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %14018 = "torch.aten.mul.Scalar"(%arg65, %14017) : (!torch.vtensor<[4,?],si64>, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%14018, %1478) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %14019 = "torch.constant.int"() <{value = 50 : i64}> : () -> !torch.int
    %14020 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %14021 = "torch.aten.add.Scalar"(%14018, %14019, %14020) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%14021, %1478) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %14022 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %14023 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %14024 = "torch.aten.add.Scalar"(%14021, %14022, %14023) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%14024, %1478) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %14025 = "torch.prim.ListConstruct"(%1924) : (!torch.int) -> !torch.list<int>
    %14026 = "torch.aten.view"(%14024, %14025) : (!torch.vtensor<[4,?],si64>, !torch.list<int>) -> !torch.vtensor<[?],si64>
    "torch.bind_symbolic_shape"(%14026, %1478) <{shape_expressions = #map24}> : (!torch.vtensor<[?],si64>, !torch.int) -> ()
    %14027 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %14028 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %14029 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %14030 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %14031 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %14032 = "torch.prim.ListConstruct"(%1483, %14027, %14028, %14029, %14030, %14031) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %14033 = "torch.aten.view"(%14016, %14032) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%14033, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %14034 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %14035 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %14036 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %14037 = "torch.prim.ListConstruct"(%1914, %14034, %14035, %14036) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %14038 = "torch.aten.view"(%14033, %14037) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%14038, %1479) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %14039 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %14040 = "torch.aten.index_select"(%14038, %14039, %14026) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int, !torch.vtensor<[?],si64>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%14040, %1478) <{shape_expressions = #map23}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %14041 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %14042 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %14043 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %14044 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %14045 = "torch.prim.ListConstruct"(%14041, %1481, %14042, %14043, %14044) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %14046 = "torch.aten.view"(%14040, %14045) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%14046, %1478) <{shape_expressions = #map22}> : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %14047 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %14048 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %14049 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %14050 = "torch.prim.ListConstruct"(%14047, %1485, %14048, %14049) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %14051 = "torch.aten.view"(%14046, %14050) : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%14051, %1478) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %14052 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %14053 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %14054 = "torch.aten.add.Scalar"(%14021, %14052, %14053) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%14054, %1478) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %14055 = "torch.prim.ListConstruct"(%1924) : (!torch.int) -> !torch.list<int>
    %14056 = "torch.aten.view"(%14054, %14055) : (!torch.vtensor<[4,?],si64>, !torch.list<int>) -> !torch.vtensor<[?],si64>
    "torch.bind_symbolic_shape"(%14056, %1478) <{shape_expressions = #map24}> : (!torch.vtensor<[?],si64>, !torch.int) -> ()
    %14057 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %14058 = "torch.aten.index_select"(%14038, %14057, %14056) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int, !torch.vtensor<[?],si64>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%14058, %1478) <{shape_expressions = #map23}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %14059 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %14060 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %14061 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %14062 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %14063 = "torch.prim.ListConstruct"(%14059, %1481, %14060, %14061, %14062) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %14064 = "torch.aten.view"(%14058, %14063) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%14064, %1478) <{shape_expressions = #map22}> : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %14065 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %14066 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %14067 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %14068 = "torch.prim.ListConstruct"(%14065, %1485, %14066, %14067) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %14069 = "torch.aten.view"(%14064, %14068) : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%14069, %1478) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %14070 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %14071 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %14072 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %14073 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %14074 = "torch.aten.slice.Tensor"(%14051, %14070, %14071, %14072, %14073) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%14074, %1478) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %14075 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %14076 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %14077 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %14078 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %14079 = "torch.aten.slice.Tensor"(%14069, %14075, %14076, %14077, %14078) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%14079, %1478) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %14080 = "torch.constant.int"() <{value = -2 : i64}> : () -> !torch.int
    %14081 = "torch.aten.unsqueeze"(%14074, %14080) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%14081, %1478) <{shape_expressions = #map25}> : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.int) -> ()
    %14082 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %14083 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %14084 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %14085 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %14086 = "torch.prim.ListConstruct"(%14082, %1485, %14083, %14084, %14085) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %14087 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %14088 = "torch.aten.expand"(%14081, %14086, %14087) : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%14088, %1478) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %14089 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %14090 = "torch.aten.clone"(%14088, %14089) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%14090, %1478) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %14091 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %14092 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %14093 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %14094 = "torch.prim.ListConstruct"(%14091, %1485, %14092, %14093) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %14095 = "torch.aten._unsafe_view"(%14090, %14094) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%14095, %1478) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %14096 = "torch.constant.int"() <{value = -2 : i64}> : () -> !torch.int
    %14097 = "torch.aten.unsqueeze"(%14079, %14096) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%14097, %1478) <{shape_expressions = #map25}> : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.int) -> ()
    %14098 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %14099 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %14100 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %14101 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %14102 = "torch.prim.ListConstruct"(%14098, %1485, %14099, %14100, %14101) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %14103 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %14104 = "torch.aten.expand"(%14097, %14102, %14103) : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%14104, %1478) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %14105 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %14106 = "torch.aten.clone"(%14104, %14105) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%14106, %1478) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %14107 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %14108 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %14109 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %14110 = "torch.prim.ListConstruct"(%14107, %1485, %14108, %14109) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %14111 = "torch.aten._unsafe_view"(%14106, %14110) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%14111, %1478) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %14112 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %14113 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %14114 = "torch.aten.transpose.int"(%13896, %14112, %14113) : (!torch.vtensor<[4,1,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %14115 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %14116 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %14117 = "torch.aten.transpose.int"(%14095, %14115, %14116) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%14117, %1478) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %14118 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %14119 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %14120 = "torch.aten.transpose.int"(%14111, %14118, %14119) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%14120, %1478) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %14121 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %14122 = "torch.aten.squeeze.dim"(%1516, %14121) : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,1,1,?],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%14122, %1478) <{shape_expressions = #map7}> : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>, !torch.int) -> ()
    %14123 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %14124 = "torch.aten.squeeze.dim"(%14122, %14123) : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,1,1,?],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%14124, %1478) <{shape_expressions = #map7}> : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>, !torch.int) -> ()
    %14125 = "torch_c.to_builtin_tensor"(%14114) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>) -> tensor<4x32x1x128xf8E4M3FNUZ>
    %14126 = "tensor.cast"(%14125) : (tensor<4x32x1x128xf8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %14127 = "torch_c.to_builtin_tensor"(%14117) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %14128 = "torch_c.to_builtin_tensor"(%14120) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %14129 = "torch_c.to_builtin_tensor"(%14124) : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>) -> tensor<4x1x1x?xf8E4M3FNUZ>
    %14130 = "tensor.cast"(%14129) : (tensor<4x1x1x?xf8E4M3FNUZ>) -> tensor<?x?x?x?xf8E4M3FNUZ>
    %14131 = "torch_c.to_builtin_tensor"(%1202) : (!torch.vtensor<[],f32>) -> tensor<f32>
    %14132 = "util.call"(%14126, %14127, %14128, %14131, %14130) <{callee = @sharktank_masked_flash_attention_4_32_128_128_f8E4M3FNUZ_f32_f32}> : (tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<f32>, tensor<?x?x?x?xf8E4M3FNUZ>) -> tensor<4x32x?x128xf32>
    %14133 = "tensor.cast"(%14132) : (tensor<4x32x?x128xf32>) -> tensor<4x32x1x128xf32>
    %14134 = "torch_c.from_builtin_tensor"(%14133) : (tensor<4x32x1x128xf32>) -> !torch.vtensor<[4,32,1,128],f32>
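    // Attention output back to [4, 1, 32, 128], flattened to [4, 1, 4096],
    // then the usual scale/clamp/cast quantization before the O projection.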
    %14135 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %14136 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %14137 = "torch.aten.transpose.int"(%14134, %14135, %14136) : (!torch.vtensor<[4,32,1,128],f32>, !torch.int, !torch.int) -> !torch.vtensor<[4,1,32,128],f32>
    %14138 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %14139 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %14140 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %14141 = "torch.prim.ListConstruct"(%14138, %14139, %14140) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %14142 = "torch.aten.view"(%14137, %14141) : (!torch.vtensor<[4,1,32,128],f32>, !torch.list<int>) -> !torch.vtensor<[4,1,4096],f32>
    %14143 = "torch.aten.div.Tensor"(%14142, %1204) : (!torch.vtensor<[4,1,4096],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],f32>
    %14144 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %14145 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %14146 = "torch.aten.clamp"(%14143, %14144, %14145) : (!torch.vtensor<[4,1,4096],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],f32>
    %14147 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %14148 = "torch.prims.convert_element_type"(%14146, %14147) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
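    // O projection ([4096, 4096] weight), dequantized to bf16, then the
    // attention residual add onto %13763.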
    %14149 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %14150 = "torch.aten.unsqueeze"(%1206, %14149) : (!torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>
    %14151 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %14152 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %14153 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %14154 = "torch.prim.ListConstruct"(%14151, %14152, %14153) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %14155 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %14156 = "torch.aten.expand"(%14150, %14154, %14155) : (!torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ>
    %14157 = "torch_c.to_builtin_tensor"(%14148) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %14158 = "torch_c.to_builtin_tensor"(%14156) : (!torch.vtensor<[4,4096,4096],f8E4M3FNUZ>) -> tensor<4x4096x4096xf8E4M3FNUZ>
    %14159 = "util.call"(%14157, %14158) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x1x4096xf32>
    %14160 = "torch_c.from_builtin_tensor"(%14159) : (tensor<4x1x4096xf32>) -> !torch.vtensor<[4,1,4096],f32>
    %14161 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %14162 = "torch.prims.convert_element_type"(%14160, %14161) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %14163 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %14164 = "torch.aten.add.Tensor"(%13763, %14162, %14163) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
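    // Second RMSNorm of the layer (FFN input norm), same recipe as above with
    // its own weight (%1208).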
    %14165 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %14166 = "torch.prims.convert_element_type"(%14164, %14165) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f32>
    %14167 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %14168 = "torch.aten.pow.Tensor_Scalar"(%14166, %14167) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],f32>
    %14169 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %14170 = "torch.prim.ListConstruct"(%14169) : (!torch.int) -> !torch.list<int>
    %14171 = "torch.constant.bool"() <{value = true}> : () -> !torch.bool
    %14172 = "torch.constant.none"() : () -> !torch.none
    %14173 = "torch.aten.mean.dim"(%14168, %14170, %14171, %14172) : (!torch.vtensor<[4,1,4096],f32>, !torch.list<int>, !torch.bool, !torch.none) -> !torch.vtensor<[4,1,1],f32>
    %14174 = "torch.constant.float"() <{value = 1.000000e-05 : f64}> : () -> !torch.float
    %14175 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %14176 = "torch.aten.add.Scalar"(%14173, %14174, %14175) : (!torch.vtensor<[4,1,1],f32>, !torch.float, !torch.int) -> !torch.vtensor<[4,1,1],f32>
    %14177 = "torch.aten.rsqrt"(%14176) : (!torch.vtensor<[4,1,1],f32>) -> !torch.vtensor<[4,1,1],f32>
    %14178 = "torch.aten.mul.Tensor"(%14166, %14177) : (!torch.vtensor<[4,1,4096],f32>, !torch.vtensor<[4,1,1],f32>) -> !torch.vtensor<[4,1,4096],f32>
    %14179 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %14180 = "torch.prims.convert_element_type"(%14178, %14179) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %14181 = "torch.aten.mul.Tensor"(%1208, %14180) : (!torch.vtensor<[4096],bf16>, !torch.vtensor<[4,1,4096],bf16>) -> !torch.vtensor<[4,1,4096],bf16>
    %14182 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %14183 = "torch.prims.convert_element_type"(%14181, %14182) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
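    // SwiGLU FFN again: quantize, gate-project to [4, 1, 14336] with a
    // [14336, 4096] weight, and apply SiLU.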
    %14184 = "torch.aten.div.Tensor"(%14183, %1210) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %14185 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %14186 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %14187 = "torch.aten.clamp"(%14184, %14185, %14186) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %14188 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %14189 = "torch.prims.convert_element_type"(%14187, %14188) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %14190 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %14191 = "torch.aten.unsqueeze"(%1212, %14190) : (!torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>
    %14192 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %14193 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %14194 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %14195 = "torch.prim.ListConstruct"(%14192, %14193, %14194) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %14196 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %14197 = "torch.aten.expand"(%14191, %14195, %14196) : (!torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ>
    %14198 = "torch_c.to_builtin_tensor"(%14189) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %14199 = "torch_c.to_builtin_tensor"(%14197) : (!torch.vtensor<[4,14336,4096],f8E4M3FNUZ>) -> tensor<4x14336x4096xf8E4M3FNUZ>
    %14200 = "util.call"(%14198, %14199) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x1x14336xf32>
    %14201 = "torch_c.from_builtin_tensor"(%14200) : (tensor<4x1x14336xf32>) -> !torch.vtensor<[4,1,14336],f32>
    %14202 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %14203 = "torch.prims.convert_element_type"(%14201, %14202) : (!torch.vtensor<[4,1,14336],f32>, !torch.int) -> !torch.vtensor<[4,1,14336],bf16>
    %14204 = "torch.aten.silu"(%14203) : (!torch.vtensor<[4,1,14336],bf16>) -> !torch.vtensor<[4,1,14336],bf16>
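    // Up projection with the second [14336, 4096] weight; the SiLU-gated
    // branch then multiplies it elementwise (%14225 = silu(gate) * up).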
    %14205 = "torch.aten.div.Tensor"(%14183, %1214) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %14206 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %14207 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %14208 = "torch.aten.clamp"(%14205, %14206, %14207) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %14209 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %14210 = "torch.prims.convert_element_type"(%14208, %14209) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %14211 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %14212 = "torch.aten.unsqueeze"(%1216, %14211) : (!torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>
    %14213 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %14214 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %14215 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %14216 = "torch.prim.ListConstruct"(%14213, %14214, %14215) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %14217 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %14218 = "torch.aten.expand"(%14212, %14216, %14217) : (!torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ>
    %14219 = "torch_c.to_builtin_tensor"(%14210) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %14220 = "torch_c.to_builtin_tensor"(%14218) : (!torch.vtensor<[4,14336,4096],f8E4M3FNUZ>) -> tensor<4x14336x4096xf8E4M3FNUZ>
    %14221 = "util.call"(%14219, %14220) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x1x14336xf32>
    %14222 = "torch_c.from_builtin_tensor"(%14221) : (tensor<4x1x14336xf32>) -> !torch.vtensor<[4,1,14336],f32>
    %14223 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %14224 = "torch.prims.convert_element_type"(%14222, %14223) : (!torch.vtensor<[4,1,14336],f32>, !torch.int) -> !torch.vtensor<[4,1,14336],bf16>
    %14225 = "torch.aten.mul.Tensor"(%14204, %14224) : (!torch.vtensor<[4,1,14336],bf16>, !torch.vtensor<[4,1,14336],bf16>) -> !torch.vtensor<[4,1,14336],bf16>
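    // Quantize the combined activation and project back down to 4096 with the
    // [4096, 14336] weight, mirroring the sequence that opened this section.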
    %14226 = "torch.aten.div.Tensor"(%14225, %1218) : (!torch.vtensor<[4,1,14336],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,14336],bf16>
    %14227 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %14228 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %14229 = "torch.aten.clamp"(%14226, %14227, %14228) : (!torch.vtensor<[4,1,14336],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,14336],bf16>
    %14230 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %14231 = "torch.prims.convert_element_type"(%14229, %14230) : (!torch.vtensor<[4,1,14336],bf16>, !torch.int) -> !torch.vtensor<[4,1,14336],f8E4M3FNUZ>
    %14232 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %14233 = "torch.aten.unsqueeze"(%1220, %14232) : (!torch.vtensor<[4096,14336],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,14336],f8E4M3FNUZ>
    %14234 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %14235 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %14236 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %14237 = "torch.prim.ListConstruct"(%14234, %14235, %14236) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %14238 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %14239 = "torch.aten.expand"(%14233, %14237, %14238) : (!torch.vtensor<[1,4096,14336],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,14336],f8E4M3FNUZ>
    %14240 = "torch_c.to_builtin_tensor"(%14231) : (!torch.vtensor<[4,1,14336],f8E4M3FNUZ>) -> tensor<4x1x14336xf8E4M3FNUZ>
    %14241 = "torch_c.to_builtin_tensor"(%14239) : (!torch.vtensor<[4,4096,14336],f8E4M3FNUZ>) -> tensor<4x4096x14336xf8E4M3FNUZ>
    %14242 = "util.call"(%14240, %14241) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x14336xf8E4M3FNUZ_R4x4096x14336xf8E4M3FNUZ}> : (tensor<4x1x14336xf8E4M3FNUZ>, tensor<4x4096x14336xf8E4M3FNUZ>) -> tensor<4x1x4096xf32>
    %14243 = "torch_c.from_builtin_tensor"(%14242) : (tensor<4x1x4096xf32>) -> !torch.vtensor<[4,1,4096],f32>
    %14244 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %14245 = "torch.prims.convert_element_type"(%14243, %14244) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %14246 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %14247 = "torch.aten.add.Tensor"(%14164, %14245, %14246) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %14248 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %14249 = "torch.prims.convert_element_type"(%14247, %14248) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f32>
    %14250 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %14251 = "torch.aten.pow.Tensor_Scalar"(%14249, %14250) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],f32>
    %14252 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %14253 = "torch.prim.ListConstruct"(%14252) : (!torch.int) -> !torch.list<int>
    %14254 = "torch.constant.bool"() <{value = true}> : () -> !torch.bool
    %14255 = "torch.constant.none"() : () -> !torch.none
    %14256 = "torch.aten.mean.dim"(%14251, %14253, %14254, %14255) : (!torch.vtensor<[4,1,4096],f32>, !torch.list<int>, !torch.bool, !torch.none) -> !torch.vtensor<[4,1,1],f32>
    %14257 = "torch.constant.float"() <{value = 1.000000e-05 : f64}> : () -> !torch.float
    %14258 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %14259 = "torch.aten.add.Scalar"(%14256, %14257, %14258) : (!torch.vtensor<[4,1,1],f32>, !torch.float, !torch.int) -> !torch.vtensor<[4,1,1],f32>
    %14260 = "torch.aten.rsqrt"(%14259) : (!torch.vtensor<[4,1,1],f32>) -> !torch.vtensor<[4,1,1],f32>
    %14261 = "torch.aten.mul.Tensor"(%14249, %14260) : (!torch.vtensor<[4,1,4096],f32>, !torch.vtensor<[4,1,1],f32>) -> !torch.vtensor<[4,1,4096],f32>
    %14262 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %14263 = "torch.prims.convert_element_type"(%14261, %14262) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %14264 = "torch.aten.mul.Tensor"(%1222, %14263) : (!torch.vtensor<[4096],bf16>, !torch.vtensor<[4,1,4096],bf16>) -> !torch.vtensor<[4,1,4096],bf16>
    %14265 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %14266 = "torch.prims.convert_element_type"(%14264, %14265) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %14267 = "torch.aten.div.Tensor"(%14266, %1224) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %14268 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %14269 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %14270 = "torch.aten.clamp"(%14267, %14268, %14269) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %14271 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %14272 = "torch.prims.convert_element_type"(%14270, %14271) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %14273 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %14274 = "torch.aten.unsqueeze"(%1226, %14273) : (!torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>
    %14275 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %14276 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %14277 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %14278 = "torch.prim.ListConstruct"(%14275, %14276, %14277) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %14279 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %14280 = "torch.aten.expand"(%14274, %14278, %14279) : (!torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ>
    %14281 = "torch_c.to_builtin_tensor"(%14272) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %14282 = "torch_c.to_builtin_tensor"(%14280) : (!torch.vtensor<[4,4096,4096],f8E4M3FNUZ>) -> tensor<4x4096x4096xf8E4M3FNUZ>
    %14283 = "util.call"(%14281, %14282) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x1x4096xf32>
    %14284 = "torch_c.from_builtin_tensor"(%14283) : (tensor<4x1x4096xf32>) -> !torch.vtensor<[4,1,4096],f32>
    %14285 = "torch.aten.div.Tensor"(%14284, %1228) : (!torch.vtensor<[4,1,4096],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],f32>
    %14286 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %14287 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %14288 = "torch.aten.clamp"(%14285, %14286, %14287) : (!torch.vtensor<[4,1,4096],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],f32>
    %14289 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %14290 = "torch.prims.convert_element_type"(%14288, %14289) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %14291 = "torch.aten.div.Tensor"(%14266, %1230) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %14292 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %14293 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %14294 = "torch.aten.clamp"(%14291, %14292, %14293) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %14295 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %14296 = "torch.prims.convert_element_type"(%14294, %14295) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %14297 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %14298 = "torch.aten.unsqueeze"(%1232, %14297) : (!torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>
    %14299 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %14300 = "torch.constant.int"() <{value = 1024 : i64}> : () -> !torch.int
    %14301 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %14302 = "torch.prim.ListConstruct"(%14299, %14300, %14301) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %14303 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %14304 = "torch.aten.expand"(%14298, %14302, %14303) : (!torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ>
    %14305 = "torch_c.to_builtin_tensor"(%14296) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %14306 = "torch_c.to_builtin_tensor"(%14304) : (!torch.vtensor<[4,1024,4096],f8E4M3FNUZ>) -> tensor<4x1024x4096xf8E4M3FNUZ>
    %14307 = "util.call"(%14305, %14306) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x1x1024xf32>
    %14308 = "torch_c.from_builtin_tensor"(%14307) : (tensor<4x1x1024xf32>) -> !torch.vtensor<[4,1,1024],f32>
    %14309 = "torch.aten.div.Tensor"(%14308, %1234) : (!torch.vtensor<[4,1,1024],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,1024],f32>
    %14310 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %14311 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %14312 = "torch.aten.clamp"(%14309, %14310, %14311) : (!torch.vtensor<[4,1,1024],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,1024],f32>
    %14313 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %14314 = "torch.prims.convert_element_type"(%14312, %14313) : (!torch.vtensor<[4,1,1024],f32>, !torch.int) -> !torch.vtensor<[4,1,1024],f8E4M3FNUZ>
    %14315 = "torch.aten.div.Tensor"(%14266, %1236) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %14316 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %14317 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %14318 = "torch.aten.clamp"(%14315, %14316, %14317) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %14319 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %14320 = "torch.prims.convert_element_type"(%14318, %14319) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %14321 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %14322 = "torch.aten.unsqueeze"(%1238, %14321) : (!torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>
    %14323 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %14324 = "torch.constant.int"() <{value = 1024 : i64}> : () -> !torch.int
    %14325 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %14326 = "torch.prim.ListConstruct"(%14323, %14324, %14325) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %14327 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %14328 = "torch.aten.expand"(%14322, %14326, %14327) : (!torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ>
    %14329 = "torch_c.to_builtin_tensor"(%14320) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %14330 = "torch_c.to_builtin_tensor"(%14328) : (!torch.vtensor<[4,1024,4096],f8E4M3FNUZ>) -> tensor<4x1024x4096xf8E4M3FNUZ>
    %14331 = "util.call"(%14329, %14330) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x1x1024xf32>
    %14332 = "torch_c.from_builtin_tensor"(%14331) : (tensor<4x1x1024xf32>) -> !torch.vtensor<[4,1,1024],f32>
    %14333 = "torch.aten.div.Tensor"(%14332, %1240) : (!torch.vtensor<[4,1,1024],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,1024],f32>
    %14334 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %14335 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %14336 = "torch.aten.clamp"(%14333, %14334, %14335) : (!torch.vtensor<[4,1,1024],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,1024],f32>
    %14337 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %14338 = "torch.prims.convert_element_type"(%14336, %14337) : (!torch.vtensor<[4,1,1024],f32>, !torch.int) -> !torch.vtensor<[4,1,1024],f8E4M3FNUZ>
    %14339 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %14340 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %14341 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %14342 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %14343 = "torch.prim.ListConstruct"(%14339, %14340, %14341, %14342) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %14344 = "torch.aten.view"(%14290, %14343) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,1,32,128],f8E4M3FNUZ>
    %14345 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %14346 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %14347 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %14348 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %14349 = "torch.prim.ListConstruct"(%14345, %14346, %14347, %14348) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %14350 = "torch.aten.view"(%14314, %14349) : (!torch.vtensor<[4,1,1024],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>
    %14351 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %14352 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %14353 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %14354 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %14355 = "torch.prim.ListConstruct"(%14351, %14352, %14353, %14354) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %14356 = "torch.aten.view"(%14338, %14355) : (!torch.vtensor<[4,1,1024],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>
    %14357 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %14358 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %14359 = "torch.aten.transpose.int"(%14344, %14357, %14358) : (!torch.vtensor<[4,1,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %14360 = "torch.aten.mul.Tensor"(%14359, %1637) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,1,1,128],bf16>) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %14361 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %14362 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %14363 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %14364 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %14365 = "torch.aten.slice.Tensor"(%14359, %14361, %14362, %14363, %14364) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,32,1,64],f8E4M3FNUZ>
    %14366 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %14367 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %14368 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %14369 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %14370 = "torch.aten.slice.Tensor"(%14359, %14366, %14367, %14368, %14369) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,32,1,64],f8E4M3FNUZ>
    %14371 = "torch.aten.neg"(%14370) : (!torch.vtensor<[4,32,1,64],f8E4M3FNUZ>) -> !torch.vtensor<[4,32,1,64],f8E4M3FNUZ>
    %14372 = "torch.prim.ListConstruct"(%14371, %14365) : (!torch.vtensor<[4,32,1,64],f8E4M3FNUZ>, !torch.vtensor<[4,32,1,64],f8E4M3FNUZ>) -> !torch.list<vtensor>
    %14373 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %14374 = "torch.aten.cat"(%14372, %14373) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %14375 = "torch.aten.mul.Tensor"(%14374, %1651) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,1,1,128],bf16>) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %14376 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %14377 = "torch.aten.add.Tensor"(%14360, %14375, %14376) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %14378 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %14379 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %14380 = "torch.aten.transpose.int"(%14377, %14378, %14379) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,1,32,128],f8E4M3FNUZ>
    %14381 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %14382 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %14383 = "torch.aten.transpose.int"(%14350, %14381, %14382) : (!torch.vtensor<[4,1,8,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %14384 = "torch.aten.mul.Tensor"(%14383, %1637) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,1,1,128],bf16>) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %14385 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %14386 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %14387 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %14388 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %14389 = "torch.aten.slice.Tensor"(%14383, %14385, %14386, %14387, %14388) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,8,1,64],f8E4M3FNUZ>
    %14390 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %14391 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %14392 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %14393 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %14394 = "torch.aten.slice.Tensor"(%14383, %14390, %14391, %14392, %14393) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,8,1,64],f8E4M3FNUZ>
    %14395 = "torch.aten.neg"(%14394) : (!torch.vtensor<[4,8,1,64],f8E4M3FNUZ>) -> !torch.vtensor<[4,8,1,64],f8E4M3FNUZ>
    %14396 = "torch.prim.ListConstruct"(%14395, %14389) : (!torch.vtensor<[4,8,1,64],f8E4M3FNUZ>, !torch.vtensor<[4,8,1,64],f8E4M3FNUZ>) -> !torch.list<vtensor>
    %14397 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %14398 = "torch.aten.cat"(%14396, %14397) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %14399 = "torch.aten.mul.Tensor"(%14398, %1651) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,1,1,128],bf16>) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %14400 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %14401 = "torch.aten.add.Tensor"(%14384, %14399, %14400) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %14402 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %14403 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %14404 = "torch.aten.transpose.int"(%14401, %14402, %14403) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>
    %14405 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %14406 = "torch.aten.floor_divide.Scalar"(%arg64, %14405) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4],si64>
    %14407 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %14408 = "torch.aten.unsqueeze"(%14406, %14407) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %14409 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %14410 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %14411 = "torch.aten.gather"(%arg65, %14409, %14408, %14410) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.vtensor<[4,1],si64>, !torch.bool) -> !torch.vtensor<[4,1],si64>
    %14412 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %14413 = "torch.aten.remainder.Scalar"(%arg64, %14412) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4],si64>
    %14414 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %14415 = "torch.aten.unsqueeze"(%14413, %14414) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %14416 = "torch.constant.none"() : () -> !torch.none
    %14417 = "torch.aten.clone"(%1241, %14416) : (!torch.vtensor<[],si64>, !torch.none) -> !torch.vtensor<[],si64>
    %14418 = "torch.aten.detach"(%14417) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %14419 = "torch.aten.detach"(%14418) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %14420 = "torch.aten.detach"(%14419) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %14421 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %14422 = "torch.aten.unsqueeze"(%14420, %14421) : (!torch.vtensor<[],si64>, !torch.int) -> !torch.vtensor<[1],si64>
    %14423 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %14424 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %14425 = "torch.prim.ListConstruct"(%14423, %14424) : (!torch.int, !torch.int) -> !torch.list<int>
    %14426 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %14427 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %14428 = "torch.prim.ListConstruct"(%14426, %14427) : (!torch.int, !torch.int) -> !torch.list<int>
    %14429 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %14430 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %14431 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %14432 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %14433 = "torch.aten.empty_strided"(%14425, %14428, %14429, %14430, %14431, %14432) : (!torch.list<int>, !torch.list<int>, !torch.int, !torch.int, !torch.Device, !torch.bool) -> !torch.vtensor<[4,1],si64>
    %14434 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %14435 = "torch.aten.fill.Scalar"(%14433, %14434) : (!torch.vtensor<[4,1],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %14436 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %14437 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %14438 = "torch.prim.ListConstruct"(%14436, %14437) : (!torch.int, !torch.int) -> !torch.list<int>
    %14439 = "torch.aten.repeat"(%14422, %14438) : (!torch.vtensor<[1],si64>, !torch.list<int>) -> !torch.vtensor<[4,1],si64>
    %14440 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %14441 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %14442 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %14443 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %14444 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %14445 = "torch.prim.ListConstruct"(%1483, %14440, %14441, %14442, %14443, %14444) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %14446 = "torch.aten.view"(%14016, %14445) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%14446, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %14447 = "torch.prim.ListConstruct"(%14411, %14435, %14439, %14415) : (!torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>) -> !torch.list<optional<vtensor>>
    %14448 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %14449 = "torch.aten.index_put"(%14446, %14447, %14404, %14448) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>, !torch.bool) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%14449, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %14450 = "torch.constant.int"() <{value = 2097152 : i64}> : () -> !torch.int
    %14451 = "torch.prim.ListConstruct"(%1483, %14450) : (!torch.int, !torch.int) -> !torch.list<int>
    %14452 = "torch.aten.view"(%14449, %14451) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,2097152],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%14452, %1479) <{shape_expressions = #map2}> : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> ()
    %14453 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %14454 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %14455 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %14456 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %14457 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %14458 = "torch.prim.ListConstruct"(%1483, %14453, %14454, %14455, %14456, %14457) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %14459 = "torch.aten.view"(%14452, %14458) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%14459, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %14460 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %14461 = "torch.aten.floor_divide.Scalar"(%arg64, %14460) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4],si64>
    %14462 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %14463 = "torch.aten.unsqueeze"(%14461, %14462) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %14464 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %14465 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %14466 = "torch.aten.gather"(%arg65, %14464, %14463, %14465) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.vtensor<[4,1],si64>, !torch.bool) -> !torch.vtensor<[4,1],si64>
    %14467 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %14468 = "torch.aten.remainder.Scalar"(%arg64, %14467) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4],si64>
    %14469 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %14470 = "torch.aten.unsqueeze"(%14468, %14469) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %14471 = "torch.constant.none"() : () -> !torch.none
    %14472 = "torch.aten.clone"(%1242, %14471) : (!torch.vtensor<[],si64>, !torch.none) -> !torch.vtensor<[],si64>
    %14473 = "torch.aten.detach"(%14472) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %14474 = "torch.aten.detach"(%14473) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %14475 = "torch.aten.detach"(%14474) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %14476 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %14477 = "torch.aten.unsqueeze"(%14475, %14476) : (!torch.vtensor<[],si64>, !torch.int) -> !torch.vtensor<[1],si64>
    %14478 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %14479 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %14480 = "torch.prim.ListConstruct"(%14478, %14479) : (!torch.int, !torch.int) -> !torch.list<int>
    %14481 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %14482 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %14483 = "torch.prim.ListConstruct"(%14481, %14482) : (!torch.int, !torch.int) -> !torch.list<int>
    %14484 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %14485 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %14486 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %14487 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %14488 = "torch.aten.empty_strided"(%14480, %14483, %14484, %14485, %14486, %14487) : (!torch.list<int>, !torch.list<int>, !torch.int, !torch.int, !torch.Device, !torch.bool) -> !torch.vtensor<[4,1],si64>
    %14489 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %14490 = "torch.aten.fill.Scalar"(%14488, %14489) : (!torch.vtensor<[4,1],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %14491 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %14492 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %14493 = "torch.prim.ListConstruct"(%14491, %14492) : (!torch.int, !torch.int) -> !torch.list<int>
    %14494 = "torch.aten.repeat"(%14477, %14493) : (!torch.vtensor<[1],si64>, !torch.list<int>) -> !torch.vtensor<[4,1],si64>
    %14495 = "torch.prim.ListConstruct"(%14466, %14490, %14494, %14470) : (!torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>) -> !torch.list<optional<vtensor>>
    %14496 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %14497 = "torch.aten.index_put"(%14459, %14495, %14356, %14496) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>, !torch.bool) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%14497, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %14498 = "torch.constant.int"() <{value = 2097152 : i64}> : () -> !torch.int
    %14499 = "torch.prim.ListConstruct"(%1483, %14498) : (!torch.int, !torch.int) -> !torch.list<int>
    %14500 = "torch.aten.view"(%14497, %14499) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,2097152],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%14500, %1479) <{shape_expressions = #map2}> : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> ()
    %14501 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %14502 = "torch.aten.mul.Scalar"(%arg65, %14501) : (!torch.vtensor<[4,?],si64>, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%14502, %1478) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %14503 = "torch.constant.int"() <{value = 52 : i64}> : () -> !torch.int
    %14504 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %14505 = "torch.aten.add.Scalar"(%14502, %14503, %14504) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%14505, %1478) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %14506 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %14507 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %14508 = "torch.aten.add.Scalar"(%14505, %14506, %14507) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%14508, %1478) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %14509 = "torch.prim.ListConstruct"(%1924) : (!torch.int) -> !torch.list<int>
    %14510 = "torch.aten.view"(%14508, %14509) : (!torch.vtensor<[4,?],si64>, !torch.list<int>) -> !torch.vtensor<[?],si64>
    "torch.bind_symbolic_shape"(%14510, %1478) <{shape_expressions = #map24}> : (!torch.vtensor<[?],si64>, !torch.int) -> ()
    %14511 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %14512 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %14513 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %14514 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %14515 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %14516 = "torch.prim.ListConstruct"(%1483, %14511, %14512, %14513, %14514, %14515) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %14517 = "torch.aten.view"(%14500, %14516) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%14517, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %14518 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %14519 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %14520 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %14521 = "torch.prim.ListConstruct"(%1914, %14518, %14519, %14520) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %14522 = "torch.aten.view"(%14517, %14521) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%14522, %1479) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %14523 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %14524 = "torch.aten.index_select"(%14522, %14523, %14510) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int, !torch.vtensor<[?],si64>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%14524, %1478) <{shape_expressions = #map23}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %14525 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %14526 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %14527 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %14528 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %14529 = "torch.prim.ListConstruct"(%14525, %1481, %14526, %14527, %14528) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %14530 = "torch.aten.view"(%14524, %14529) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%14530, %1478) <{shape_expressions = #map22}> : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %14531 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %14532 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %14533 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %14534 = "torch.prim.ListConstruct"(%14531, %1485, %14532, %14533) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %14535 = "torch.aten.view"(%14530, %14534) : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%14535, %1478) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %14536 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %14537 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %14538 = "torch.aten.add.Scalar"(%14505, %14536, %14537) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%14538, %1478) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %14539 = "torch.prim.ListConstruct"(%1924) : (!torch.int) -> !torch.list<int>
    %14540 = "torch.aten.view"(%14538, %14539) : (!torch.vtensor<[4,?],si64>, !torch.list<int>) -> !torch.vtensor<[?],si64>
    "torch.bind_symbolic_shape"(%14540, %1478) <{shape_expressions = #map24}> : (!torch.vtensor<[?],si64>, !torch.int) -> ()
    %14541 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %14542 = "torch.aten.index_select"(%14522, %14541, %14540) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int, !torch.vtensor<[?],si64>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%14542, %1478) <{shape_expressions = #map23}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %14543 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %14544 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %14545 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %14546 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %14547 = "torch.prim.ListConstruct"(%14543, %1481, %14544, %14545, %14546) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %14548 = "torch.aten.view"(%14542, %14547) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%14548, %1478) <{shape_expressions = #map22}> : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %14549 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %14550 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %14551 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %14552 = "torch.prim.ListConstruct"(%14549, %1485, %14550, %14551) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %14553 = "torch.aten.view"(%14548, %14552) : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%14553, %1478) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %14554 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %14555 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %14556 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %14557 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %14558 = "torch.aten.slice.Tensor"(%14535, %14554, %14555, %14556, %14557) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%14558, %1478) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %14559 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %14560 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %14561 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %14562 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %14563 = "torch.aten.slice.Tensor"(%14553, %14559, %14560, %14561, %14562) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%14563, %1478) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %14564 = "torch.constant.int"() <{value = -2 : i64}> : () -> !torch.int
    %14565 = "torch.aten.unsqueeze"(%14558, %14564) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%14565, %1478) <{shape_expressions = #map25}> : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.int) -> ()
    %14566 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %14567 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %14568 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %14569 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %14570 = "torch.prim.ListConstruct"(%14566, %1485, %14567, %14568, %14569) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %14571 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %14572 = "torch.aten.expand"(%14565, %14570, %14571) : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%14572, %1478) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %14573 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %14574 = "torch.aten.clone"(%14572, %14573) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%14574, %1478) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %14575 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %14576 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %14577 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %14578 = "torch.prim.ListConstruct"(%14575, %1485, %14576, %14577) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %14579 = "torch.aten._unsafe_view"(%14574, %14578) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%14579, %1478) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %14580 = "torch.constant.int"() <{value = -2 : i64}> : () -> !torch.int
    %14581 = "torch.aten.unsqueeze"(%14563, %14580) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%14581, %1478) <{shape_expressions = #map25}> : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.int) -> ()
    %14582 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %14583 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %14584 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %14585 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %14586 = "torch.prim.ListConstruct"(%14582, %1485, %14583, %14584, %14585) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %14587 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %14588 = "torch.aten.expand"(%14581, %14586, %14587) : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%14588, %1478) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %14589 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %14590 = "torch.aten.clone"(%14588, %14589) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%14590, %1478) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %14591 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %14592 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %14593 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %14594 = "torch.prim.ListConstruct"(%14591, %1485, %14592, %14593) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %14595 = "torch.aten._unsafe_view"(%14590, %14594) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%14595, %1478) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %14596 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %14597 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %14598 = "torch.aten.transpose.int"(%14380, %14596, %14597) : (!torch.vtensor<[4,1,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %14599 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %14600 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %14601 = "torch.aten.transpose.int"(%14579, %14599, %14600) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%14601, %1478) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %14602 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %14603 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %14604 = "torch.aten.transpose.int"(%14595, %14602, %14603) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%14604, %1478) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %14605 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %14606 = "torch.aten.squeeze.dim"(%1516, %14605) : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,1,1,?],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%14606, %1478) <{shape_expressions = #map7}> : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>, !torch.int) -> ()
    %14607 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %14608 = "torch.aten.squeeze.dim"(%14606, %14607) : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,1,1,?],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%14608, %1478) <{shape_expressions = #map7}> : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>, !torch.int) -> ()
    %14609 = "torch_c.to_builtin_tensor"(%14598) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>) -> tensor<4x32x1x128xf8E4M3FNUZ>
    %14610 = "tensor.cast"(%14609) : (tensor<4x32x1x128xf8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %14611 = "torch_c.to_builtin_tensor"(%14601) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %14612 = "torch_c.to_builtin_tensor"(%14604) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %14613 = "torch_c.to_builtin_tensor"(%14608) : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>) -> tensor<4x1x1x?xf8E4M3FNUZ>
    %14614 = "tensor.cast"(%14613) : (tensor<4x1x1x?xf8E4M3FNUZ>) -> tensor<?x?x?x?xf8E4M3FNUZ>
    %14615 = "torch_c.to_builtin_tensor"(%1244) : (!torch.vtensor<[],f32>) -> tensor<f32>
    %14616 = "util.call"(%14610, %14611, %14612, %14615, %14614) <{callee = @sharktank_masked_flash_attention_4_32_128_128_f8E4M3FNUZ_f32_f32}> : (tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<f32>, tensor<?x?x?x?xf8E4M3FNUZ>) -> tensor<4x32x?x128xf32>
    %14617 = "tensor.cast"(%14616) : (tensor<4x32x?x128xf32>) -> tensor<4x32x1x128xf32>
    %14618 = "torch_c.from_builtin_tensor"(%14617) : (tensor<4x32x1x128xf32>) -> !torch.vtensor<[4,32,1,128],f32>
    %14619 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %14620 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %14621 = "torch.aten.transpose.int"(%14618, %14619, %14620) : (!torch.vtensor<[4,32,1,128],f32>, !torch.int, !torch.int) -> !torch.vtensor<[4,1,32,128],f32>
    %14622 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %14623 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %14624 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %14625 = "torch.prim.ListConstruct"(%14622, %14623, %14624) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %14626 = "torch.aten.view"(%14621, %14625) : (!torch.vtensor<[4,1,32,128],f32>, !torch.list<int>) -> !torch.vtensor<[4,1,4096],f32>
    %14627 = "torch.aten.div.Tensor"(%14626, %1246) : (!torch.vtensor<[4,1,4096],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],f32>
    %14628 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %14629 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %14630 = "torch.aten.clamp"(%14627, %14628, %14629) : (!torch.vtensor<[4,1,4096],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],f32>
    %14631 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %14632 = "torch.prims.convert_element_type"(%14630, %14631) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %14633 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %14634 = "torch.aten.unsqueeze"(%1248, %14633) : (!torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>
    %14635 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %14636 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %14637 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %14638 = "torch.prim.ListConstruct"(%14635, %14636, %14637) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %14639 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %14640 = "torch.aten.expand"(%14634, %14638, %14639) : (!torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ>
    %14641 = "torch_c.to_builtin_tensor"(%14632) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %14642 = "torch_c.to_builtin_tensor"(%14640) : (!torch.vtensor<[4,4096,4096],f8E4M3FNUZ>) -> tensor<4x4096x4096xf8E4M3FNUZ>
    %14643 = "util.call"(%14641, %14642) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x1x4096xf32>
    %14644 = "torch_c.from_builtin_tensor"(%14643) : (tensor<4x1x4096xf32>) -> !torch.vtensor<[4,1,4096],f32>
    %14645 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %14646 = "torch.prims.convert_element_type"(%14644, %14645) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %14647 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %14648 = "torch.aten.add.Tensor"(%14247, %14646, %14647) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %14649 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %14650 = "torch.prims.convert_element_type"(%14648, %14649) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f32>
    %14651 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %14652 = "torch.aten.pow.Tensor_Scalar"(%14650, %14651) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],f32>
    %14653 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %14654 = "torch.prim.ListConstruct"(%14653) : (!torch.int) -> !torch.list<int>
    %14655 = "torch.constant.bool"() <{value = true}> : () -> !torch.bool
    %14656 = "torch.constant.none"() : () -> !torch.none
    %14657 = "torch.aten.mean.dim"(%14652, %14654, %14655, %14656) : (!torch.vtensor<[4,1,4096],f32>, !torch.list<int>, !torch.bool, !torch.none) -> !torch.vtensor<[4,1,1],f32>
    %14658 = "torch.constant.float"() <{value = 1.000000e-05 : f64}> : () -> !torch.float
    %14659 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %14660 = "torch.aten.add.Scalar"(%14657, %14658, %14659) : (!torch.vtensor<[4,1,1],f32>, !torch.float, !torch.int) -> !torch.vtensor<[4,1,1],f32>
    %14661 = "torch.aten.rsqrt"(%14660) : (!torch.vtensor<[4,1,1],f32>) -> !torch.vtensor<[4,1,1],f32>
    %14662 = "torch.aten.mul.Tensor"(%14650, %14661) : (!torch.vtensor<[4,1,4096],f32>, !torch.vtensor<[4,1,1],f32>) -> !torch.vtensor<[4,1,4096],f32>
    %14663 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %14664 = "torch.prims.convert_element_type"(%14662, %14663) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %14665 = "torch.aten.mul.Tensor"(%1250, %14664) : (!torch.vtensor<[4096],bf16>, !torch.vtensor<[4,1,4096],bf16>) -> !torch.vtensor<[4,1,4096],bf16>
    %14666 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %14667 = "torch.prims.convert_element_type"(%14665, %14666) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %14668 = "torch.aten.div.Tensor"(%14667, %1252) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %14669 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %14670 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %14671 = "torch.aten.clamp"(%14668, %14669, %14670) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %14672 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %14673 = "torch.prims.convert_element_type"(%14671, %14672) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %14674 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %14675 = "torch.aten.unsqueeze"(%1254, %14674) : (!torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>
    %14676 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %14677 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %14678 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %14679 = "torch.prim.ListConstruct"(%14676, %14677, %14678) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %14680 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %14681 = "torch.aten.expand"(%14675, %14679, %14680) : (!torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ>
    %14682 = "torch_c.to_builtin_tensor"(%14673) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %14683 = "torch_c.to_builtin_tensor"(%14681) : (!torch.vtensor<[4,14336,4096],f8E4M3FNUZ>) -> tensor<4x14336x4096xf8E4M3FNUZ>
    %14684 = "util.call"(%14682, %14683) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x1x14336xf32>
    %14685 = "torch_c.from_builtin_tensor"(%14684) : (tensor<4x1x14336xf32>) -> !torch.vtensor<[4,1,14336],f32>
    %14686 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %14687 = "torch.prims.convert_element_type"(%14685, %14686) : (!torch.vtensor<[4,1,14336],f32>, !torch.int) -> !torch.vtensor<[4,1,14336],bf16>
    %14688 = "torch.aten.silu"(%14687) : (!torch.vtensor<[4,1,14336],bf16>) -> !torch.vtensor<[4,1,14336],bf16>
    %14689 = "torch.aten.div.Tensor"(%14667, %1256) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %14690 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %14691 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %14692 = "torch.aten.clamp"(%14689, %14690, %14691) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %14693 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %14694 = "torch.prims.convert_element_type"(%14692, %14693) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %14695 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %14696 = "torch.aten.unsqueeze"(%1258, %14695) : (!torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>
    %14697 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %14698 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %14699 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %14700 = "torch.prim.ListConstruct"(%14697, %14698, %14699) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %14701 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %14702 = "torch.aten.expand"(%14696, %14700, %14701) : (!torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ>
    %14703 = "torch_c.to_builtin_tensor"(%14694) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %14704 = "torch_c.to_builtin_tensor"(%14702) : (!torch.vtensor<[4,14336,4096],f8E4M3FNUZ>) -> tensor<4x14336x4096xf8E4M3FNUZ>
    %14705 = "util.call"(%14703, %14704) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x1x14336xf32>
    %14706 = "torch_c.from_builtin_tensor"(%14705) : (tensor<4x1x14336xf32>) -> !torch.vtensor<[4,1,14336],f32>
    %14707 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %14708 = "torch.prims.convert_element_type"(%14706, %14707) : (!torch.vtensor<[4,1,14336],f32>, !torch.int) -> !torch.vtensor<[4,1,14336],bf16>
    %14709 = "torch.aten.mul.Tensor"(%14688, %14708) : (!torch.vtensor<[4,1,14336],bf16>, !torch.vtensor<[4,1,14336],bf16>) -> !torch.vtensor<[4,1,14336],bf16>
    %14710 = "torch.aten.div.Tensor"(%14709, %1260) : (!torch.vtensor<[4,1,14336],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,14336],bf16>
    %14711 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %14712 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %14713 = "torch.aten.clamp"(%14710, %14711, %14712) : (!torch.vtensor<[4,1,14336],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,14336],bf16>
    %14714 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %14715 = "torch.prims.convert_element_type"(%14713, %14714) : (!torch.vtensor<[4,1,14336],bf16>, !torch.int) -> !torch.vtensor<[4,1,14336],f8E4M3FNUZ>
    %14716 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %14717 = "torch.aten.unsqueeze"(%1262, %14716) : (!torch.vtensor<[4096,14336],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,14336],f8E4M3FNUZ>
    %14718 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %14719 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %14720 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %14721 = "torch.prim.ListConstruct"(%14718, %14719, %14720) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %14722 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %14723 = "torch.aten.expand"(%14717, %14721, %14722) : (!torch.vtensor<[1,4096,14336],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,14336],f8E4M3FNUZ>
    %14724 = "torch_c.to_builtin_tensor"(%14715) : (!torch.vtensor<[4,1,14336],f8E4M3FNUZ>) -> tensor<4x1x14336xf8E4M3FNUZ>
    %14725 = "torch_c.to_builtin_tensor"(%14723) : (!torch.vtensor<[4,4096,14336],f8E4M3FNUZ>) -> tensor<4x4096x14336xf8E4M3FNUZ>
    %14726 = "util.call"(%14724, %14725) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x14336xf8E4M3FNUZ_R4x4096x14336xf8E4M3FNUZ}> : (tensor<4x1x14336xf8E4M3FNUZ>, tensor<4x4096x14336xf8E4M3FNUZ>) -> tensor<4x1x4096xf32>
    %14727 = "torch_c.from_builtin_tensor"(%14726) : (tensor<4x1x4096xf32>) -> !torch.vtensor<[4,1,4096],f32>
    %14728 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %14729 = "torch.prims.convert_element_type"(%14727, %14728) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %14730 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %14731 = "torch.aten.add.Tensor"(%14648, %14729, %14730) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %14732 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %14733 = "torch.prims.convert_element_type"(%14731, %14732) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f32>
    %14734 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %14735 = "torch.aten.pow.Tensor_Scalar"(%14733, %14734) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],f32>
    %14736 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %14737 = "torch.prim.ListConstruct"(%14736) : (!torch.int) -> !torch.list<int>
    %14738 = "torch.constant.bool"() <{value = true}> : () -> !torch.bool
    %14739 = "torch.constant.none"() : () -> !torch.none
    %14740 = "torch.aten.mean.dim"(%14735, %14737, %14738, %14739) : (!torch.vtensor<[4,1,4096],f32>, !torch.list<int>, !torch.bool, !torch.none) -> !torch.vtensor<[4,1,1],f32>
    %14741 = "torch.constant.float"() <{value = 1.000000e-05 : f64}> : () -> !torch.float
    %14742 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %14743 = "torch.aten.add.Scalar"(%14740, %14741, %14742) : (!torch.vtensor<[4,1,1],f32>, !torch.float, !torch.int) -> !torch.vtensor<[4,1,1],f32>
    %14744 = "torch.aten.rsqrt"(%14743) : (!torch.vtensor<[4,1,1],f32>) -> !torch.vtensor<[4,1,1],f32>
    %14745 = "torch.aten.mul.Tensor"(%14733, %14744) : (!torch.vtensor<[4,1,4096],f32>, !torch.vtensor<[4,1,1],f32>) -> !torch.vtensor<[4,1,4096],f32>
    %14746 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %14747 = "torch.prims.convert_element_type"(%14745, %14746) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %14748 = "torch.aten.mul.Tensor"(%1264, %14747) : (!torch.vtensor<[4096],bf16>, !torch.vtensor<[4,1,4096],bf16>) -> !torch.vtensor<[4,1,4096],bf16>
    %14749 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %14750 = "torch.prims.convert_element_type"(%14748, %14749) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %14751 = "torch.aten.div.Tensor"(%14750, %1266) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %14752 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %14753 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %14754 = "torch.aten.clamp"(%14751, %14752, %14753) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %14755 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %14756 = "torch.prims.convert_element_type"(%14754, %14755) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %14757 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %14758 = "torch.aten.unsqueeze"(%1268, %14757) : (!torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>
    %14759 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %14760 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %14761 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %14762 = "torch.prim.ListConstruct"(%14759, %14760, %14761) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %14763 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %14764 = "torch.aten.expand"(%14758, %14762, %14763) : (!torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ>
    %14765 = "torch_c.to_builtin_tensor"(%14756) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %14766 = "torch_c.to_builtin_tensor"(%14764) : (!torch.vtensor<[4,4096,4096],f8E4M3FNUZ>) -> tensor<4x4096x4096xf8E4M3FNUZ>
    %14767 = "util.call"(%14765, %14766) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x1x4096xf32>
    %14768 = "torch_c.from_builtin_tensor"(%14767) : (tensor<4x1x4096xf32>) -> !torch.vtensor<[4,1,4096],f32>
    %14769 = "torch.aten.div.Tensor"(%14768, %1270) : (!torch.vtensor<[4,1,4096],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],f32>
    %14770 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %14771 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %14772 = "torch.aten.clamp"(%14769, %14770, %14771) : (!torch.vtensor<[4,1,4096],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],f32>
    %14773 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %14774 = "torch.prims.convert_element_type"(%14772, %14773) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %14775 = "torch.aten.div.Tensor"(%14750, %1272) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %14776 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %14777 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %14778 = "torch.aten.clamp"(%14775, %14776, %14777) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %14779 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %14780 = "torch.prims.convert_element_type"(%14778, %14779) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %14781 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %14782 = "torch.aten.unsqueeze"(%1274, %14781) : (!torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>
    %14783 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %14784 = "torch.constant.int"() <{value = 1024 : i64}> : () -> !torch.int
    %14785 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %14786 = "torch.prim.ListConstruct"(%14783, %14784, %14785) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %14787 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %14788 = "torch.aten.expand"(%14782, %14786, %14787) : (!torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ>
    %14789 = "torch_c.to_builtin_tensor"(%14780) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %14790 = "torch_c.to_builtin_tensor"(%14788) : (!torch.vtensor<[4,1024,4096],f8E4M3FNUZ>) -> tensor<4x1024x4096xf8E4M3FNUZ>
    %14791 = "util.call"(%14789, %14790) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x1x1024xf32>
    %14792 = "torch_c.from_builtin_tensor"(%14791) : (tensor<4x1x1024xf32>) -> !torch.vtensor<[4,1,1024],f32>
    %14793 = "torch.aten.div.Tensor"(%14792, %1276) : (!torch.vtensor<[4,1,1024],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,1024],f32>
    %14794 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %14795 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %14796 = "torch.aten.clamp"(%14793, %14794, %14795) : (!torch.vtensor<[4,1,1024],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,1024],f32>
    %14797 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %14798 = "torch.prims.convert_element_type"(%14796, %14797) : (!torch.vtensor<[4,1,1024],f32>, !torch.int) -> !torch.vtensor<[4,1,1024],f8E4M3FNUZ>
    %14799 = "torch.aten.div.Tensor"(%14750, %1278) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %14800 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %14801 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %14802 = "torch.aten.clamp"(%14799, %14800, %14801) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %14803 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %14804 = "torch.prims.convert_element_type"(%14802, %14803) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %14805 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %14806 = "torch.aten.unsqueeze"(%1280, %14805) : (!torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>
    %14807 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %14808 = "torch.constant.int"() <{value = 1024 : i64}> : () -> !torch.int
    %14809 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %14810 = "torch.prim.ListConstruct"(%14807, %14808, %14809) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %14811 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %14812 = "torch.aten.expand"(%14806, %14810, %14811) : (!torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ>
    %14813 = "torch_c.to_builtin_tensor"(%14804) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %14814 = "torch_c.to_builtin_tensor"(%14812) : (!torch.vtensor<[4,1024,4096],f8E4M3FNUZ>) -> tensor<4x1024x4096xf8E4M3FNUZ>
    %14815 = "util.call"(%14813, %14814) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x1x1024xf32>
    %14816 = "torch_c.from_builtin_tensor"(%14815) : (tensor<4x1x1024xf32>) -> !torch.vtensor<[4,1,1024],f32>
    %14817 = "torch.aten.div.Tensor"(%14816, %1282) : (!torch.vtensor<[4,1,1024],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,1024],f32>
    %14818 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %14819 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %14820 = "torch.aten.clamp"(%14817, %14818, %14819) : (!torch.vtensor<[4,1,1024],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,1024],f32>
    %14821 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %14822 = "torch.prims.convert_element_type"(%14820, %14821) : (!torch.vtensor<[4,1,1024],f32>, !torch.int) -> !torch.vtensor<[4,1,1024],f8E4M3FNUZ>
    %14823 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %14824 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %14825 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %14826 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %14827 = "torch.prim.ListConstruct"(%14823, %14824, %14825, %14826) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %14828 = "torch.aten.view"(%14774, %14827) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,1,32,128],f8E4M3FNUZ>
    %14829 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %14830 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %14831 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %14832 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %14833 = "torch.prim.ListConstruct"(%14829, %14830, %14831, %14832) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %14834 = "torch.aten.view"(%14798, %14833) : (!torch.vtensor<[4,1,1024],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>
    %14835 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %14836 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %14837 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %14838 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %14839 = "torch.prim.ListConstruct"(%14835, %14836, %14837, %14838) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %14840 = "torch.aten.view"(%14822, %14839) : (!torch.vtensor<[4,1,1024],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>
    %14841 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %14842 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %14843 = "torch.aten.transpose.int"(%14828, %14841, %14842) : (!torch.vtensor<[4,1,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %14844 = "torch.aten.mul.Tensor"(%14843, %1637) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,1,1,128],bf16>) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %14845 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %14846 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %14847 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %14848 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %14849 = "torch.aten.slice.Tensor"(%14843, %14845, %14846, %14847, %14848) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,32,1,64],f8E4M3FNUZ>
    %14850 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %14851 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %14852 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %14853 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %14854 = "torch.aten.slice.Tensor"(%14843, %14850, %14851, %14852, %14853) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,32,1,64],f8E4M3FNUZ>
    %14855 = "torch.aten.neg"(%14854) : (!torch.vtensor<[4,32,1,64],f8E4M3FNUZ>) -> !torch.vtensor<[4,32,1,64],f8E4M3FNUZ>
    %14856 = "torch.prim.ListConstruct"(%14855, %14849) : (!torch.vtensor<[4,32,1,64],f8E4M3FNUZ>, !torch.vtensor<[4,32,1,64],f8E4M3FNUZ>) -> !torch.list<vtensor>
    %14857 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %14858 = "torch.aten.cat"(%14856, %14857) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %14859 = "torch.aten.mul.Tensor"(%14858, %1651) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,1,1,128],bf16>) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %14860 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %14861 = "torch.aten.add.Tensor"(%14844, %14859, %14860) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %14862 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %14863 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %14864 = "torch.aten.transpose.int"(%14861, %14862, %14863) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,1,32,128],f8E4M3FNUZ>
    %14865 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %14866 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %14867 = "torch.aten.transpose.int"(%14834, %14865, %14866) : (!torch.vtensor<[4,1,8,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %14868 = "torch.aten.mul.Tensor"(%14867, %1637) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,1,1,128],bf16>) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %14869 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %14870 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %14871 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %14872 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %14873 = "torch.aten.slice.Tensor"(%14867, %14869, %14870, %14871, %14872) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,8,1,64],f8E4M3FNUZ>
    %14874 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %14875 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %14876 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %14877 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %14878 = "torch.aten.slice.Tensor"(%14867, %14874, %14875, %14876, %14877) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,8,1,64],f8E4M3FNUZ>
    %14879 = "torch.aten.neg"(%14878) : (!torch.vtensor<[4,8,1,64],f8E4M3FNUZ>) -> !torch.vtensor<[4,8,1,64],f8E4M3FNUZ>
    %14880 = "torch.prim.ListConstruct"(%14879, %14873) : (!torch.vtensor<[4,8,1,64],f8E4M3FNUZ>, !torch.vtensor<[4,8,1,64],f8E4M3FNUZ>) -> !torch.list<vtensor>
    %14881 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %14882 = "torch.aten.cat"(%14880, %14881) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %14883 = "torch.aten.mul.Tensor"(%14882, %1651) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,1,1,128],bf16>) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %14884 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %14885 = "torch.aten.add.Tensor"(%14868, %14883, %14884) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %14886 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %14887 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %14888 = "torch.aten.transpose.int"(%14885, %14886, %14887) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>
    %14889 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %14890 = "torch.aten.floor_divide.Scalar"(%arg64, %14889) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4],si64>
    %14891 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %14892 = "torch.aten.unsqueeze"(%14890, %14891) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %14893 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %14894 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %14895 = "torch.aten.gather"(%arg65, %14893, %14892, %14894) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.vtensor<[4,1],si64>, !torch.bool) -> !torch.vtensor<[4,1],si64>
    %14896 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %14897 = "torch.aten.remainder.Scalar"(%arg64, %14896) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4],si64>
    %14898 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %14899 = "torch.aten.unsqueeze"(%14897, %14898) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %14900 = "torch.constant.none"() : () -> !torch.none
    %14901 = "torch.aten.clone"(%1283, %14900) : (!torch.vtensor<[],si64>, !torch.none) -> !torch.vtensor<[],si64>
    %14902 = "torch.aten.detach"(%14901) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %14903 = "torch.aten.detach"(%14902) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %14904 = "torch.aten.detach"(%14903) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %14905 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %14906 = "torch.aten.unsqueeze"(%14904, %14905) : (!torch.vtensor<[],si64>, !torch.int) -> !torch.vtensor<[1],si64>
    %14907 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %14908 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %14909 = "torch.prim.ListConstruct"(%14907, %14908) : (!torch.int, !torch.int) -> !torch.list<int>
    %14910 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %14911 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %14912 = "torch.prim.ListConstruct"(%14910, %14911) : (!torch.int, !torch.int) -> !torch.list<int>
    %14913 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %14914 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %14915 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %14916 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %14917 = "torch.aten.empty_strided"(%14909, %14912, %14913, %14914, %14915, %14916) : (!torch.list<int>, !torch.list<int>, !torch.int, !torch.int, !torch.Device, !torch.bool) -> !torch.vtensor<[4,1],si64>
    %14918 = "torch.constant.int"() <{value = 27 : i64}> : () -> !torch.int
    %14919 = "torch.aten.fill.Scalar"(%14917, %14918) : (!torch.vtensor<[4,1],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %14920 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %14921 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %14922 = "torch.prim.ListConstruct"(%14920, %14921) : (!torch.int, !torch.int) -> !torch.list<int>
    %14923 = "torch.aten.repeat"(%14906, %14922) : (!torch.vtensor<[1],si64>, !torch.list<int>) -> !torch.vtensor<[4,1],si64>
    %14924 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %14925 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %14926 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %14927 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %14928 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %14929 = "torch.prim.ListConstruct"(%1483, %14924, %14925, %14926, %14927, %14928) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %14930 = "torch.aten.view"(%14500, %14929) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%14930, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %14931 = "torch.prim.ListConstruct"(%14895, %14919, %14923, %14899) : (!torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>) -> !torch.list<optional<vtensor>>
    %14932 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %14933 = "torch.aten.index_put"(%14930, %14931, %14888, %14932) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>, !torch.bool) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%14933, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %14934 = "torch.constant.int"() <{value = 2097152 : i64}> : () -> !torch.int
    %14935 = "torch.prim.ListConstruct"(%1483, %14934) : (!torch.int, !torch.int) -> !torch.list<int>
    %14936 = "torch.aten.view"(%14933, %14935) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,2097152],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%14936, %1479) <{shape_expressions = #map2}> : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> ()
    %14937 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %14938 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %14939 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %14940 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %14941 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %14942 = "torch.prim.ListConstruct"(%1483, %14937, %14938, %14939, %14940, %14941) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %14943 = "torch.aten.view"(%14936, %14942) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%14943, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %14944 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %14945 = "torch.aten.floor_divide.Scalar"(%arg64, %14944) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4],si64>
    %14946 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %14947 = "torch.aten.unsqueeze"(%14945, %14946) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %14948 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %14949 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %14950 = "torch.aten.gather"(%arg65, %14948, %14947, %14949) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.vtensor<[4,1],si64>, !torch.bool) -> !torch.vtensor<[4,1],si64>
    %14951 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %14952 = "torch.aten.remainder.Scalar"(%arg64, %14951) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4],si64>
    %14953 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %14954 = "torch.aten.unsqueeze"(%14952, %14953) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %14955 = "torch.constant.none"() : () -> !torch.none
    %14956 = "torch.aten.clone"(%1284, %14955) : (!torch.vtensor<[],si64>, !torch.none) -> !torch.vtensor<[],si64>
    %14957 = "torch.aten.detach"(%14956) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %14958 = "torch.aten.detach"(%14957) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %14959 = "torch.aten.detach"(%14958) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %14960 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %14961 = "torch.aten.unsqueeze"(%14959, %14960) : (!torch.vtensor<[],si64>, !torch.int) -> !torch.vtensor<[1],si64>
    %14962 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %14963 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %14964 = "torch.prim.ListConstruct"(%14962, %14963) : (!torch.int, !torch.int) -> !torch.list<int>
    %14965 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %14966 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %14967 = "torch.prim.ListConstruct"(%14965, %14966) : (!torch.int, !torch.int) -> !torch.list<int>
    %14968 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %14969 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %14970 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %14971 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %14972 = "torch.aten.empty_strided"(%14964, %14967, %14968, %14969, %14970, %14971) : (!torch.list<int>, !torch.list<int>, !torch.int, !torch.int, !torch.Device, !torch.bool) -> !torch.vtensor<[4,1],si64>
    %14973 = "torch.constant.int"() <{value = 27 : i64}> : () -> !torch.int
    %14974 = "torch.aten.fill.Scalar"(%14972, %14973) : (!torch.vtensor<[4,1],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %14975 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %14976 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %14977 = "torch.prim.ListConstruct"(%14975, %14976) : (!torch.int, !torch.int) -> !torch.list<int>
    %14978 = "torch.aten.repeat"(%14961, %14977) : (!torch.vtensor<[1],si64>, !torch.list<int>) -> !torch.vtensor<[4,1],si64>
    %14979 = "torch.prim.ListConstruct"(%14950, %14974, %14978, %14954) : (!torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>) -> !torch.list<optional<vtensor>>
    %14980 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %14981 = "torch.aten.index_put"(%14943, %14979, %14840, %14980) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>, !torch.bool) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%14981, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %14982 = "torch.constant.int"() <{value = 2097152 : i64}> : () -> !torch.int
    %14983 = "torch.prim.ListConstruct"(%1483, %14982) : (!torch.int, !torch.int) -> !torch.list<int>
    %14984 = "torch.aten.view"(%14981, %14983) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,2097152],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%14984, %1479) <{shape_expressions = #map2}> : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> ()
    %14985 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %14986 = "torch.aten.mul.Scalar"(%arg65, %14985) : (!torch.vtensor<[4,?],si64>, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%14986, %1478) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %14987 = "torch.constant.int"() <{value = 54 : i64}> : () -> !torch.int
    %14988 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %14989 = "torch.aten.add.Scalar"(%14986, %14987, %14988) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%14989, %1478) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %14990 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %14991 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %14992 = "torch.aten.add.Scalar"(%14989, %14990, %14991) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%14992, %1478) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %14993 = "torch.prim.ListConstruct"(%1924) : (!torch.int) -> !torch.list<int>
    %14994 = "torch.aten.view"(%14992, %14993) : (!torch.vtensor<[4,?],si64>, !torch.list<int>) -> !torch.vtensor<[?],si64>
    "torch.bind_symbolic_shape"(%14994, %1478) <{shape_expressions = #map24}> : (!torch.vtensor<[?],si64>, !torch.int) -> ()
    %14995 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %14996 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %14997 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %14998 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %14999 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %15000 = "torch.prim.ListConstruct"(%1483, %14995, %14996, %14997, %14998, %14999) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %15001 = "torch.aten.view"(%14984, %15000) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%15001, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %15002 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %15003 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %15004 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %15005 = "torch.prim.ListConstruct"(%1914, %15002, %15003, %15004) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %15006 = "torch.aten.view"(%15001, %15005) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%15006, %1479) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %15007 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %15008 = "torch.aten.index_select"(%15006, %15007, %14994) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int, !torch.vtensor<[?],si64>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%15008, %1478) <{shape_expressions = #map23}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %15009 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %15010 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %15011 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %15012 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %15013 = "torch.prim.ListConstruct"(%15009, %1481, %15010, %15011, %15012) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %15014 = "torch.aten.view"(%15008, %15013) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%15014, %1478) <{shape_expressions = #map22}> : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %15015 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %15016 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %15017 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %15018 = "torch.prim.ListConstruct"(%15015, %1485, %15016, %15017) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %15019 = "torch.aten.view"(%15014, %15018) : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%15019, %1478) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %15020 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %15021 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %15022 = "torch.aten.add.Scalar"(%14989, %15020, %15021) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%15022, %1478) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %15023 = "torch.prim.ListConstruct"(%1924) : (!torch.int) -> !torch.list<int>
    %15024 = "torch.aten.view"(%15022, %15023) : (!torch.vtensor<[4,?],si64>, !torch.list<int>) -> !torch.vtensor<[?],si64>
    "torch.bind_symbolic_shape"(%15024, %1478) <{shape_expressions = #map24}> : (!torch.vtensor<[?],si64>, !torch.int) -> ()
    %15025 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %15026 = "torch.aten.index_select"(%15006, %15025, %15024) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int, !torch.vtensor<[?],si64>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%15026, %1478) <{shape_expressions = #map23}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %15027 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %15028 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %15029 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %15030 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %15031 = "torch.prim.ListConstruct"(%15027, %1481, %15028, %15029, %15030) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %15032 = "torch.aten.view"(%15026, %15031) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%15032, %1478) <{shape_expressions = #map22}> : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %15033 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %15034 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %15035 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %15036 = "torch.prim.ListConstruct"(%15033, %1485, %15034, %15035) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %15037 = "torch.aten.view"(%15032, %15036) : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%15037, %1478) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %15038 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %15039 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %15040 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %15041 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %15042 = "torch.aten.slice.Tensor"(%15019, %15038, %15039, %15040, %15041) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%15042, %1478) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %15043 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %15044 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %15045 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %15046 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %15047 = "torch.aten.slice.Tensor"(%15037, %15043, %15044, %15045, %15046) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%15047, %1478) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %15048 = "torch.constant.int"() <{value = -2 : i64}> : () -> !torch.int
    %15049 = "torch.aten.unsqueeze"(%15042, %15048) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%15049, %1478) <{shape_expressions = #map25}> : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.int) -> ()
    %15050 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %15051 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %15052 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %15053 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %15054 = "torch.prim.ListConstruct"(%15050, %1485, %15051, %15052, %15053) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %15055 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %15056 = "torch.aten.expand"(%15049, %15054, %15055) : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%15056, %1478) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %15057 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %15058 = "torch.aten.clone"(%15056, %15057) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%15058, %1478) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %15059 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %15060 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %15061 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %15062 = "torch.prim.ListConstruct"(%15059, %1485, %15060, %15061) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %15063 = "torch.aten._unsafe_view"(%15058, %15062) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%15063, %1478) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %15064 = "torch.constant.int"() <{value = -2 : i64}> : () -> !torch.int
    %15065 = "torch.aten.unsqueeze"(%15047, %15064) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%15065, %1478) <{shape_expressions = #map25}> : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.int) -> ()
    %15066 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %15067 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %15068 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %15069 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %15070 = "torch.prim.ListConstruct"(%15066, %1485, %15067, %15068, %15069) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %15071 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %15072 = "torch.aten.expand"(%15065, %15070, %15071) : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%15072, %1478) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %15073 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %15074 = "torch.aten.clone"(%15072, %15073) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%15074, %1478) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %15075 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %15076 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %15077 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %15078 = "torch.prim.ListConstruct"(%15075, %1485, %15076, %15077) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %15079 = "torch.aten._unsafe_view"(%15074, %15078) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%15079, %1478) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %15080 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %15081 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %15082 = "torch.aten.transpose.int"(%14864, %15080, %15081) : (!torch.vtensor<[4,1,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %15083 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %15084 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %15085 = "torch.aten.transpose.int"(%15063, %15083, %15084) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%15085, %1478) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %15086 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %15087 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %15088 = "torch.aten.transpose.int"(%15079, %15086, %15087) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%15088, %1478) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %15089 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %15090 = "torch.aten.squeeze.dim"(%1516, %15089) : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,1,1,?],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%15090, %1478) <{shape_expressions = #map7}> : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>, !torch.int) -> ()
    %15091 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %15092 = "torch.aten.squeeze.dim"(%15090, %15091) : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,1,1,?],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%15092, %1478) <{shape_expressions = #map7}> : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>, !torch.int) -> ()
    %15093 = "torch_c.to_builtin_tensor"(%15082) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>) -> tensor<4x32x1x128xf8E4M3FNUZ>
    %15094 = "tensor.cast"(%15093) : (tensor<4x32x1x128xf8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %15095 = "torch_c.to_builtin_tensor"(%15085) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %15096 = "torch_c.to_builtin_tensor"(%15088) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %15097 = "torch_c.to_builtin_tensor"(%15092) : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>) -> tensor<4x1x1x?xf8E4M3FNUZ>
    %15098 = "tensor.cast"(%15097) : (tensor<4x1x1x?xf8E4M3FNUZ>) -> tensor<?x?x?x?xf8E4M3FNUZ>
    %15099 = "torch_c.to_builtin_tensor"(%1286) : (!torch.vtensor<[],f32>) -> tensor<f32>
    %15100 = "util.call"(%15094, %15095, %15096, %15099, %15098) <{callee = @sharktank_masked_flash_attention_4_32_128_128_f8E4M3FNUZ_f32_f32}> : (tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<f32>, tensor<?x?x?x?xf8E4M3FNUZ>) -> tensor<4x32x?x128xf32>
    %15101 = "tensor.cast"(%15100) : (tensor<4x32x?x128xf32>) -> tensor<4x32x1x128xf32>
    %15102 = "torch_c.from_builtin_tensor"(%15101) : (tensor<4x32x1x128xf32>) -> !torch.vtensor<[4,32,1,128],f32>
    %15103 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %15104 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %15105 = "torch.aten.transpose.int"(%15102, %15103, %15104) : (!torch.vtensor<[4,32,1,128],f32>, !torch.int, !torch.int) -> !torch.vtensor<[4,1,32,128],f32>
    %15106 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %15107 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %15108 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %15109 = "torch.prim.ListConstruct"(%15106, %15107, %15108) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %15110 = "torch.aten.view"(%15105, %15109) : (!torch.vtensor<[4,1,32,128],f32>, !torch.list<int>) -> !torch.vtensor<[4,1,4096],f32>
    %15111 = "torch.aten.div.Tensor"(%15110, %1288) : (!torch.vtensor<[4,1,4096],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],f32>
    %15112 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %15113 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %15114 = "torch.aten.clamp"(%15111, %15112, %15113) : (!torch.vtensor<[4,1,4096],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],f32>
    %15115 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %15116 = "torch.prims.convert_element_type"(%15114, %15115) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %15117 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %15118 = "torch.aten.unsqueeze"(%1290, %15117) : (!torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>
    %15119 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %15120 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %15121 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %15122 = "torch.prim.ListConstruct"(%15119, %15120, %15121) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %15123 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %15124 = "torch.aten.expand"(%15118, %15122, %15123) : (!torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ>
    %15125 = "torch_c.to_builtin_tensor"(%15116) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %15126 = "torch_c.to_builtin_tensor"(%15124) : (!torch.vtensor<[4,4096,4096],f8E4M3FNUZ>) -> tensor<4x4096x4096xf8E4M3FNUZ>
    %15127 = "util.call"(%15125, %15126) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x1x4096xf32>
    %15128 = "torch_c.from_builtin_tensor"(%15127) : (tensor<4x1x4096xf32>) -> !torch.vtensor<[4,1,4096],f32>
    %15129 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %15130 = "torch.prims.convert_element_type"(%15128, %15129) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %15131 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %15132 = "torch.aten.add.Tensor"(%14731, %15130, %15131) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %15133 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %15134 = "torch.prims.convert_element_type"(%15132, %15133) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f32>
    %15135 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %15136 = "torch.aten.pow.Tensor_Scalar"(%15134, %15135) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],f32>
    %15137 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %15138 = "torch.prim.ListConstruct"(%15137) : (!torch.int) -> !torch.list<int>
    %15139 = "torch.constant.bool"() <{value = true}> : () -> !torch.bool
    %15140 = "torch.constant.none"() : () -> !torch.none
    %15141 = "torch.aten.mean.dim"(%15136, %15138, %15139, %15140) : (!torch.vtensor<[4,1,4096],f32>, !torch.list<int>, !torch.bool, !torch.none) -> !torch.vtensor<[4,1,1],f32>
    %15142 = "torch.constant.float"() <{value = 1.000000e-05 : f64}> : () -> !torch.float
    %15143 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %15144 = "torch.aten.add.Scalar"(%15141, %15142, %15143) : (!torch.vtensor<[4,1,1],f32>, !torch.float, !torch.int) -> !torch.vtensor<[4,1,1],f32>
    %15145 = "torch.aten.rsqrt"(%15144) : (!torch.vtensor<[4,1,1],f32>) -> !torch.vtensor<[4,1,1],f32>
    %15146 = "torch.aten.mul.Tensor"(%15134, %15145) : (!torch.vtensor<[4,1,4096],f32>, !torch.vtensor<[4,1,1],f32>) -> !torch.vtensor<[4,1,4096],f32>
    %15147 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %15148 = "torch.prims.convert_element_type"(%15146, %15147) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %15149 = "torch.aten.mul.Tensor"(%1292, %15148) : (!torch.vtensor<[4096],bf16>, !torch.vtensor<[4,1,4096],bf16>) -> !torch.vtensor<[4,1,4096],bf16>
    %15150 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %15151 = "torch.prims.convert_element_type"(%15149, %15150) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %15152 = "torch.aten.div.Tensor"(%15151, %1294) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %15153 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %15154 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %15155 = "torch.aten.clamp"(%15152, %15153, %15154) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %15156 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %15157 = "torch.prims.convert_element_type"(%15155, %15156) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %15158 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %15159 = "torch.aten.unsqueeze"(%1296, %15158) : (!torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>
    %15160 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %15161 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %15162 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %15163 = "torch.prim.ListConstruct"(%15160, %15161, %15162) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %15164 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %15165 = "torch.aten.expand"(%15159, %15163, %15164) : (!torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ>
    %15166 = "torch_c.to_builtin_tensor"(%15157) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %15167 = "torch_c.to_builtin_tensor"(%15165) : (!torch.vtensor<[4,14336,4096],f8E4M3FNUZ>) -> tensor<4x14336x4096xf8E4M3FNUZ>
    %15168 = "util.call"(%15166, %15167) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x1x14336xf32>
    %15169 = "torch_c.from_builtin_tensor"(%15168) : (tensor<4x1x14336xf32>) -> !torch.vtensor<[4,1,14336],f32>
    %15170 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %15171 = "torch.prims.convert_element_type"(%15169, %15170) : (!torch.vtensor<[4,1,14336],f32>, !torch.int) -> !torch.vtensor<[4,1,14336],bf16>
    %15172 = "torch.aten.silu"(%15171) : (!torch.vtensor<[4,1,14336],bf16>) -> !torch.vtensor<[4,1,14336],bf16>
    %15173 = "torch.aten.div.Tensor"(%15151, %1298) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %15174 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %15175 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %15176 = "torch.aten.clamp"(%15173, %15174, %15175) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %15177 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %15178 = "torch.prims.convert_element_type"(%15176, %15177) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %15179 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %15180 = "torch.aten.unsqueeze"(%1300, %15179) : (!torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>
    %15181 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %15182 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %15183 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %15184 = "torch.prim.ListConstruct"(%15181, %15182, %15183) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %15185 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %15186 = "torch.aten.expand"(%15180, %15184, %15185) : (!torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ>
    %15187 = "torch_c.to_builtin_tensor"(%15178) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %15188 = "torch_c.to_builtin_tensor"(%15186) : (!torch.vtensor<[4,14336,4096],f8E4M3FNUZ>) -> tensor<4x14336x4096xf8E4M3FNUZ>
    %15189 = "util.call"(%15187, %15188) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x1x14336xf32>
    %15190 = "torch_c.from_builtin_tensor"(%15189) : (tensor<4x1x14336xf32>) -> !torch.vtensor<[4,1,14336],f32>
    %15191 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %15192 = "torch.prims.convert_element_type"(%15190, %15191) : (!torch.vtensor<[4,1,14336],f32>, !torch.int) -> !torch.vtensor<[4,1,14336],bf16>
    %15193 = "torch.aten.mul.Tensor"(%15172, %15192) : (!torch.vtensor<[4,1,14336],bf16>, !torch.vtensor<[4,1,14336],bf16>) -> !torch.vtensor<[4,1,14336],bf16>
    %15194 = "torch.aten.div.Tensor"(%15193, %1302) : (!torch.vtensor<[4,1,14336],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,14336],bf16>
    %15195 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %15196 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %15197 = "torch.aten.clamp"(%15194, %15195, %15196) : (!torch.vtensor<[4,1,14336],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,14336],bf16>
    %15198 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %15199 = "torch.prims.convert_element_type"(%15197, %15198) : (!torch.vtensor<[4,1,14336],bf16>, !torch.int) -> !torch.vtensor<[4,1,14336],f8E4M3FNUZ>
    %15200 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %15201 = "torch.aten.unsqueeze"(%1304, %15200) : (!torch.vtensor<[4096,14336],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,14336],f8E4M3FNUZ>
    %15202 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %15203 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %15204 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %15205 = "torch.prim.ListConstruct"(%15202, %15203, %15204) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %15206 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %15207 = "torch.aten.expand"(%15201, %15205, %15206) : (!torch.vtensor<[1,4096,14336],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,14336],f8E4M3FNUZ>
    %15208 = "torch_c.to_builtin_tensor"(%15199) : (!torch.vtensor<[4,1,14336],f8E4M3FNUZ>) -> tensor<4x1x14336xf8E4M3FNUZ>
    %15209 = "torch_c.to_builtin_tensor"(%15207) : (!torch.vtensor<[4,4096,14336],f8E4M3FNUZ>) -> tensor<4x4096x14336xf8E4M3FNUZ>
    %15210 = "util.call"(%15208, %15209) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x14336xf8E4M3FNUZ_R4x4096x14336xf8E4M3FNUZ}> : (tensor<4x1x14336xf8E4M3FNUZ>, tensor<4x4096x14336xf8E4M3FNUZ>) -> tensor<4x1x4096xf32>
    %15211 = "torch_c.from_builtin_tensor"(%15210) : (tensor<4x1x4096xf32>) -> !torch.vtensor<[4,1,4096],f32>
    %15212 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %15213 = "torch.prims.convert_element_type"(%15211, %15212) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %15214 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %15215 = "torch.aten.add.Tensor"(%15132, %15213, %15214) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %15216 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %15217 = "torch.prims.convert_element_type"(%15215, %15216) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f32>
    %15218 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %15219 = "torch.aten.pow.Tensor_Scalar"(%15217, %15218) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],f32>
    %15220 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %15221 = "torch.prim.ListConstruct"(%15220) : (!torch.int) -> !torch.list<int>
    %15222 = "torch.constant.bool"() <{value = true}> : () -> !torch.bool
    %15223 = "torch.constant.none"() : () -> !torch.none
    %15224 = "torch.aten.mean.dim"(%15219, %15221, %15222, %15223) : (!torch.vtensor<[4,1,4096],f32>, !torch.list<int>, !torch.bool, !torch.none) -> !torch.vtensor<[4,1,1],f32>
    %15225 = "torch.constant.float"() <{value = 1.000000e-05 : f64}> : () -> !torch.float
    %15226 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %15227 = "torch.aten.add.Scalar"(%15224, %15225, %15226) : (!torch.vtensor<[4,1,1],f32>, !torch.float, !torch.int) -> !torch.vtensor<[4,1,1],f32>
    %15228 = "torch.aten.rsqrt"(%15227) : (!torch.vtensor<[4,1,1],f32>) -> !torch.vtensor<[4,1,1],f32>
    %15229 = "torch.aten.mul.Tensor"(%15217, %15228) : (!torch.vtensor<[4,1,4096],f32>, !torch.vtensor<[4,1,1],f32>) -> !torch.vtensor<[4,1,4096],f32>
    %15230 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %15231 = "torch.prims.convert_element_type"(%15229, %15230) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %15232 = "torch.aten.mul.Tensor"(%1306, %15231) : (!torch.vtensor<[4096],bf16>, !torch.vtensor<[4,1,4096],bf16>) -> !torch.vtensor<[4,1,4096],bf16>
    %15233 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %15234 = "torch.prims.convert_element_type"(%15232, %15233) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %15235 = "torch.aten.div.Tensor"(%15234, %1308) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %15236 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %15237 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %15238 = "torch.aten.clamp"(%15235, %15236, %15237) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %15239 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %15240 = "torch.prims.convert_element_type"(%15238, %15239) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %15241 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %15242 = "torch.aten.unsqueeze"(%1310, %15241) : (!torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>
    %15243 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %15244 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %15245 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %15246 = "torch.prim.ListConstruct"(%15243, %15244, %15245) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %15247 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %15248 = "torch.aten.expand"(%15242, %15246, %15247) : (!torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ>
    %15249 = "torch_c.to_builtin_tensor"(%15240) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %15250 = "torch_c.to_builtin_tensor"(%15248) : (!torch.vtensor<[4,4096,4096],f8E4M3FNUZ>) -> tensor<4x4096x4096xf8E4M3FNUZ>
    %15251 = "util.call"(%15249, %15250) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x1x4096xf32>
    %15252 = "torch_c.from_builtin_tensor"(%15251) : (tensor<4x1x4096xf32>) -> !torch.vtensor<[4,1,4096],f32>
    %15253 = "torch.aten.div.Tensor"(%15252, %1312) : (!torch.vtensor<[4,1,4096],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],f32>
    %15254 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %15255 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %15256 = "torch.aten.clamp"(%15253, %15254, %15255) : (!torch.vtensor<[4,1,4096],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],f32>
    %15257 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %15258 = "torch.prims.convert_element_type"(%15256, %15257) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %15259 = "torch.aten.div.Tensor"(%15234, %1314) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %15260 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %15261 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %15262 = "torch.aten.clamp"(%15259, %15260, %15261) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %15263 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %15264 = "torch.prims.convert_element_type"(%15262, %15263) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %15265 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %15266 = "torch.aten.unsqueeze"(%1316, %15265) : (!torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>
    %15267 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %15268 = "torch.constant.int"() <{value = 1024 : i64}> : () -> !torch.int
    %15269 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %15270 = "torch.prim.ListConstruct"(%15267, %15268, %15269) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %15271 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %15272 = "torch.aten.expand"(%15266, %15270, %15271) : (!torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ>
    %15273 = "torch_c.to_builtin_tensor"(%15264) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %15274 = "torch_c.to_builtin_tensor"(%15272) : (!torch.vtensor<[4,1024,4096],f8E4M3FNUZ>) -> tensor<4x1024x4096xf8E4M3FNUZ>
    %15275 = "util.call"(%15273, %15274) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x1x1024xf32>
    %15276 = "torch_c.from_builtin_tensor"(%15275) : (tensor<4x1x1024xf32>) -> !torch.vtensor<[4,1,1024],f32>
    %15277 = "torch.aten.div.Tensor"(%15276, %1318) : (!torch.vtensor<[4,1,1024],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,1024],f32>
    %15278 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %15279 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %15280 = "torch.aten.clamp"(%15277, %15278, %15279) : (!torch.vtensor<[4,1,1024],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,1024],f32>
    %15281 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %15282 = "torch.prims.convert_element_type"(%15280, %15281) : (!torch.vtensor<[4,1,1024],f32>, !torch.int) -> !torch.vtensor<[4,1,1024],f8E4M3FNUZ>
    %15283 = "torch.aten.div.Tensor"(%15234, %1320) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %15284 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %15285 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %15286 = "torch.aten.clamp"(%15283, %15284, %15285) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %15287 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %15288 = "torch.prims.convert_element_type"(%15286, %15287) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %15289 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %15290 = "torch.aten.unsqueeze"(%1322, %15289) : (!torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>
    %15291 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %15292 = "torch.constant.int"() <{value = 1024 : i64}> : () -> !torch.int
    %15293 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %15294 = "torch.prim.ListConstruct"(%15291, %15292, %15293) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %15295 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %15296 = "torch.aten.expand"(%15290, %15294, %15295) : (!torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ>
    %15297 = "torch_c.to_builtin_tensor"(%15288) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %15298 = "torch_c.to_builtin_tensor"(%15296) : (!torch.vtensor<[4,1024,4096],f8E4M3FNUZ>) -> tensor<4x1024x4096xf8E4M3FNUZ>
    %15299 = "util.call"(%15297, %15298) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x1x1024xf32>
    %15300 = "torch_c.from_builtin_tensor"(%15299) : (tensor<4x1x1024xf32>) -> !torch.vtensor<[4,1,1024],f32>
    %15301 = "torch.aten.div.Tensor"(%15300, %1324) : (!torch.vtensor<[4,1,1024],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,1024],f32>
    %15302 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %15303 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %15304 = "torch.aten.clamp"(%15301, %15302, %15303) : (!torch.vtensor<[4,1,1024],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,1024],f32>
    %15305 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %15306 = "torch.prims.convert_element_type"(%15304, %15305) : (!torch.vtensor<[4,1,1024],f32>, !torch.int) -> !torch.vtensor<[4,1,1024],f8E4M3FNUZ>
    %15307 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %15308 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %15309 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %15310 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %15311 = "torch.prim.ListConstruct"(%15307, %15308, %15309, %15310) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %15312 = "torch.aten.view"(%15258, %15311) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,1,32,128],f8E4M3FNUZ>
    %15313 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %15314 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %15315 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %15316 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %15317 = "torch.prim.ListConstruct"(%15313, %15314, %15315, %15316) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %15318 = "torch.aten.view"(%15282, %15317) : (!torch.vtensor<[4,1,1024],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>
    %15319 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %15320 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %15321 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %15322 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %15323 = "torch.prim.ListConstruct"(%15319, %15320, %15321, %15322) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %15324 = "torch.aten.view"(%15306, %15323) : (!torch.vtensor<[4,1,1024],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>
    %15325 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %15326 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %15327 = "torch.aten.transpose.int"(%15312, %15325, %15326) : (!torch.vtensor<[4,1,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %15328 = "torch.aten.mul.Tensor"(%15327, %1637) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,1,1,128],bf16>) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %15329 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %15330 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %15331 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %15332 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %15333 = "torch.aten.slice.Tensor"(%15327, %15329, %15330, %15331, %15332) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,32,1,64],f8E4M3FNUZ>
    %15334 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %15335 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %15336 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %15337 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %15338 = "torch.aten.slice.Tensor"(%15327, %15334, %15335, %15336, %15337) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,32,1,64],f8E4M3FNUZ>
    %15339 = "torch.aten.neg"(%15338) : (!torch.vtensor<[4,32,1,64],f8E4M3FNUZ>) -> !torch.vtensor<[4,32,1,64],f8E4M3FNUZ>
    %15340 = "torch.prim.ListConstruct"(%15339, %15333) : (!torch.vtensor<[4,32,1,64],f8E4M3FNUZ>, !torch.vtensor<[4,32,1,64],f8E4M3FNUZ>) -> !torch.list<vtensor>
    %15341 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %15342 = "torch.aten.cat"(%15340, %15341) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %15343 = "torch.aten.mul.Tensor"(%15342, %1651) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,1,1,128],bf16>) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %15344 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %15345 = "torch.aten.add.Tensor"(%15328, %15343, %15344) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %15346 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %15347 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %15348 = "torch.aten.transpose.int"(%15345, %15346, %15347) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,1,32,128],f8E4M3FNUZ>
    %15349 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %15350 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %15351 = "torch.aten.transpose.int"(%15318, %15349, %15350) : (!torch.vtensor<[4,1,8,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %15352 = "torch.aten.mul.Tensor"(%15351, %1637) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,1,1,128],bf16>) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %15353 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %15354 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %15355 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %15356 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %15357 = "torch.aten.slice.Tensor"(%15351, %15353, %15354, %15355, %15356) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,8,1,64],f8E4M3FNUZ>
    %15358 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %15359 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %15360 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %15361 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %15362 = "torch.aten.slice.Tensor"(%15351, %15358, %15359, %15360, %15361) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,8,1,64],f8E4M3FNUZ>
    %15363 = "torch.aten.neg"(%15362) : (!torch.vtensor<[4,8,1,64],f8E4M3FNUZ>) -> !torch.vtensor<[4,8,1,64],f8E4M3FNUZ>
    %15364 = "torch.prim.ListConstruct"(%15363, %15357) : (!torch.vtensor<[4,8,1,64],f8E4M3FNUZ>, !torch.vtensor<[4,8,1,64],f8E4M3FNUZ>) -> !torch.list<vtensor>
    %15365 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %15366 = "torch.aten.cat"(%15364, %15365) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %15367 = "torch.aten.mul.Tensor"(%15366, %1651) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,1,1,128],bf16>) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %15368 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %15369 = "torch.aten.add.Tensor"(%15352, %15367, %15368) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %15370 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %15371 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %15372 = "torch.aten.transpose.int"(%15369, %15370, %15371) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>
    %15373 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %15374 = "torch.aten.floor_divide.Scalar"(%arg64, %15373) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4],si64>
    %15375 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %15376 = "torch.aten.unsqueeze"(%15374, %15375) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %15377 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %15378 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %15379 = "torch.aten.gather"(%arg65, %15377, %15376, %15378) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.vtensor<[4,1],si64>, !torch.bool) -> !torch.vtensor<[4,1],si64>
    %15380 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %15381 = "torch.aten.remainder.Scalar"(%arg64, %15380) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4],si64>
    %15382 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %15383 = "torch.aten.unsqueeze"(%15381, %15382) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %15384 = "torch.constant.none"() : () -> !torch.none
    %15385 = "torch.aten.clone"(%1325, %15384) : (!torch.vtensor<[],si64>, !torch.none) -> !torch.vtensor<[],si64>
    %15386 = "torch.aten.detach"(%15385) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %15387 = "torch.aten.detach"(%15386) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %15388 = "torch.aten.detach"(%15387) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %15389 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %15390 = "torch.aten.unsqueeze"(%15388, %15389) : (!torch.vtensor<[],si64>, !torch.int) -> !torch.vtensor<[1],si64>
    %15391 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %15392 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %15393 = "torch.prim.ListConstruct"(%15391, %15392) : (!torch.int, !torch.int) -> !torch.list<int>
    %15394 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %15395 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %15396 = "torch.prim.ListConstruct"(%15394, %15395) : (!torch.int, !torch.int) -> !torch.list<int>
    %15397 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %15398 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %15399 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %15400 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %15401 = "torch.aten.empty_strided"(%15393, %15396, %15397, %15398, %15399, %15400) : (!torch.list<int>, !torch.list<int>, !torch.int, !torch.int, !torch.Device, !torch.bool) -> !torch.vtensor<[4,1],si64>
    %15402 = "torch.constant.int"() <{value = 28 : i64}> : () -> !torch.int
    %15403 = "torch.aten.fill.Scalar"(%15401, %15402) : (!torch.vtensor<[4,1],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %15404 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %15405 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %15406 = "torch.prim.ListConstruct"(%15404, %15405) : (!torch.int, !torch.int) -> !torch.list<int>
    %15407 = "torch.aten.repeat"(%15390, %15406) : (!torch.vtensor<[1],si64>, !torch.list<int>) -> !torch.vtensor<[4,1],si64>
    %15408 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %15409 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %15410 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %15411 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %15412 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %15413 = "torch.prim.ListConstruct"(%1483, %15408, %15409, %15410, %15411, %15412) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %15414 = "torch.aten.view"(%14984, %15413) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%15414, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %15415 = "torch.prim.ListConstruct"(%15379, %15403, %15407, %15383) : (!torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>) -> !torch.list<optional<vtensor>>
    %15416 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %15417 = "torch.aten.index_put"(%15414, %15415, %15372, %15416) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>, !torch.bool) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%15417, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %15418 = "torch.constant.int"() <{value = 2097152 : i64}> : () -> !torch.int
    %15419 = "torch.prim.ListConstruct"(%1483, %15418) : (!torch.int, !torch.int) -> !torch.list<int>
    %15420 = "torch.aten.view"(%15417, %15419) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,2097152],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%15420, %1479) <{shape_expressions = #map2}> : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> ()
    %15421 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %15422 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %15423 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %15424 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %15425 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %15426 = "torch.prim.ListConstruct"(%1483, %15421, %15422, %15423, %15424, %15425) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %15427 = "torch.aten.view"(%15420, %15426) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%15427, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %15428 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %15429 = "torch.aten.floor_divide.Scalar"(%arg64, %15428) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4],si64>
    %15430 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %15431 = "torch.aten.unsqueeze"(%15429, %15430) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %15432 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %15433 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %15434 = "torch.aten.gather"(%arg65, %15432, %15431, %15433) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.vtensor<[4,1],si64>, !torch.bool) -> !torch.vtensor<[4,1],si64>
    %15435 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %15436 = "torch.aten.remainder.Scalar"(%arg64, %15435) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4],si64>
    %15437 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %15438 = "torch.aten.unsqueeze"(%15436, %15437) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %15439 = "torch.constant.none"() : () -> !torch.none
    %15440 = "torch.aten.clone"(%1326, %15439) : (!torch.vtensor<[],si64>, !torch.none) -> !torch.vtensor<[],si64>
    %15441 = "torch.aten.detach"(%15440) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %15442 = "torch.aten.detach"(%15441) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %15443 = "torch.aten.detach"(%15442) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %15444 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %15445 = "torch.aten.unsqueeze"(%15443, %15444) : (!torch.vtensor<[],si64>, !torch.int) -> !torch.vtensor<[1],si64>
    %15446 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %15447 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %15448 = "torch.prim.ListConstruct"(%15446, %15447) : (!torch.int, !torch.int) -> !torch.list<int>
    %15449 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %15450 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %15451 = "torch.prim.ListConstruct"(%15449, %15450) : (!torch.int, !torch.int) -> !torch.list<int>
    %15452 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %15453 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %15454 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %15455 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %15456 = "torch.aten.empty_strided"(%15448, %15451, %15452, %15453, %15454, %15455) : (!torch.list<int>, !torch.list<int>, !torch.int, !torch.int, !torch.Device, !torch.bool) -> !torch.vtensor<[4,1],si64>
    %15457 = "torch.constant.int"() <{value = 28 : i64}> : () -> !torch.int
    %15458 = "torch.aten.fill.Scalar"(%15456, %15457) : (!torch.vtensor<[4,1],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %15459 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %15460 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %15461 = "torch.prim.ListConstruct"(%15459, %15460) : (!torch.int, !torch.int) -> !torch.list<int>
    %15462 = "torch.aten.repeat"(%15445, %15461) : (!torch.vtensor<[1],si64>, !torch.list<int>) -> !torch.vtensor<[4,1],si64>
    %15463 = "torch.prim.ListConstruct"(%15434, %15458, %15462, %15438) : (!torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>) -> !torch.list<optional<vtensor>>
    %15464 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %15465 = "torch.aten.index_put"(%15427, %15463, %15324, %15464) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>, !torch.bool) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%15465, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %15466 = "torch.constant.int"() <{value = 2097152 : i64}> : () -> !torch.int
    %15467 = "torch.prim.ListConstruct"(%1483, %15466) : (!torch.int, !torch.int) -> !torch.list<int>
    %15468 = "torch.aten.view"(%15465, %15467) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,2097152],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%15468, %1479) <{shape_expressions = #map2}> : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> ()
    %15469 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %15470 = "torch.aten.mul.Scalar"(%arg65, %15469) : (!torch.vtensor<[4,?],si64>, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%15470, %1478) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %15471 = "torch.constant.int"() <{value = 56 : i64}> : () -> !torch.int
    %15472 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %15473 = "torch.aten.add.Scalar"(%15470, %15471, %15472) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%15473, %1478) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %15474 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %15475 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %15476 = "torch.aten.add.Scalar"(%15473, %15474, %15475) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%15476, %1478) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %15477 = "torch.prim.ListConstruct"(%1924) : (!torch.int) -> !torch.list<int>
    %15478 = "torch.aten.view"(%15476, %15477) : (!torch.vtensor<[4,?],si64>, !torch.list<int>) -> !torch.vtensor<[?],si64>
    "torch.bind_symbolic_shape"(%15478, %1478) <{shape_expressions = #map24}> : (!torch.vtensor<[?],si64>, !torch.int) -> ()
    %15479 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %15480 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %15481 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %15482 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %15483 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %15484 = "torch.prim.ListConstruct"(%1483, %15479, %15480, %15481, %15482, %15483) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %15485 = "torch.aten.view"(%15468, %15484) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%15485, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %15486 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %15487 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %15488 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %15489 = "torch.prim.ListConstruct"(%1914, %15486, %15487, %15488) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %15490 = "torch.aten.view"(%15485, %15489) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%15490, %1479) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %15491 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %15492 = "torch.aten.index_select"(%15490, %15491, %15478) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int, !torch.vtensor<[?],si64>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%15492, %1478) <{shape_expressions = #map23}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %15493 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %15494 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %15495 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %15496 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %15497 = "torch.prim.ListConstruct"(%15493, %1481, %15494, %15495, %15496) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %15498 = "torch.aten.view"(%15492, %15497) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%15498, %1478) <{shape_expressions = #map22}> : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %15499 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %15500 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %15501 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %15502 = "torch.prim.ListConstruct"(%15499, %1485, %15500, %15501) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %15503 = "torch.aten.view"(%15498, %15502) : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%15503, %1478) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %15504 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %15505 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %15506 = "torch.aten.add.Scalar"(%15473, %15504, %15505) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%15506, %1478) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %15507 = "torch.prim.ListConstruct"(%1924) : (!torch.int) -> !torch.list<int>
    %15508 = "torch.aten.view"(%15506, %15507) : (!torch.vtensor<[4,?],si64>, !torch.list<int>) -> !torch.vtensor<[?],si64>
    "torch.bind_symbolic_shape"(%15508, %1478) <{shape_expressions = #map24}> : (!torch.vtensor<[?],si64>, !torch.int) -> ()
    %15509 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %15510 = "torch.aten.index_select"(%15490, %15509, %15508) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int, !torch.vtensor<[?],si64>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%15510, %1478) <{shape_expressions = #map23}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %15511 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %15512 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %15513 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %15514 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %15515 = "torch.prim.ListConstruct"(%15511, %1481, %15512, %15513, %15514) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %15516 = "torch.aten.view"(%15510, %15515) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%15516, %1478) <{shape_expressions = #map22}> : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %15517 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %15518 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %15519 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %15520 = "torch.prim.ListConstruct"(%15517, %1485, %15518, %15519) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %15521 = "torch.aten.view"(%15516, %15520) : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%15521, %1478) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %15522 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %15523 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %15524 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %15525 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %15526 = "torch.aten.slice.Tensor"(%15503, %15522, %15523, %15524, %15525) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%15526, %1478) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %15527 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %15528 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %15529 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %15530 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %15531 = "torch.aten.slice.Tensor"(%15521, %15527, %15528, %15529, %15530) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%15531, %1478) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %15532 = "torch.constant.int"() <{value = -2 : i64}> : () -> !torch.int
    %15533 = "torch.aten.unsqueeze"(%15526, %15532) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%15533, %1478) <{shape_expressions = #map25}> : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.int) -> ()
    %15534 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %15535 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %15536 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %15537 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %15538 = "torch.prim.ListConstruct"(%15534, %1485, %15535, %15536, %15537) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %15539 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %15540 = "torch.aten.expand"(%15533, %15538, %15539) : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%15540, %1478) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %15541 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %15542 = "torch.aten.clone"(%15540, %15541) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%15542, %1478) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %15543 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %15544 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %15545 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %15546 = "torch.prim.ListConstruct"(%15543, %1485, %15544, %15545) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %15547 = "torch.aten._unsafe_view"(%15542, %15546) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%15547, %1478) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %15548 = "torch.constant.int"() <{value = -2 : i64}> : () -> !torch.int
    %15549 = "torch.aten.unsqueeze"(%15531, %15548) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%15549, %1478) <{shape_expressions = #map25}> : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.int) -> ()
    %15550 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %15551 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %15552 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %15553 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %15554 = "torch.prim.ListConstruct"(%15550, %1485, %15551, %15552, %15553) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %15555 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %15556 = "torch.aten.expand"(%15549, %15554, %15555) : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%15556, %1478) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %15557 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %15558 = "torch.aten.clone"(%15556, %15557) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%15558, %1478) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %15559 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %15560 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %15561 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %15562 = "torch.prim.ListConstruct"(%15559, %1485, %15560, %15561) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %15563 = "torch.aten._unsafe_view"(%15558, %15562) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%15563, %1478) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %15564 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %15565 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %15566 = "torch.aten.transpose.int"(%15348, %15564, %15565) : (!torch.vtensor<[4,1,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %15567 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %15568 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %15569 = "torch.aten.transpose.int"(%15547, %15567, %15568) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%15569, %1478) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %15570 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %15571 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %15572 = "torch.aten.transpose.int"(%15563, %15570, %15571) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%15572, %1478) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %15573 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %15574 = "torch.aten.squeeze.dim"(%1516, %15573) : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,1,1,?],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%15574, %1478) <{shape_expressions = #map7}> : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>, !torch.int) -> ()
    %15575 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %15576 = "torch.aten.squeeze.dim"(%15574, %15575) : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,1,1,?],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%15576, %1478) <{shape_expressions = #map7}> : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>, !torch.int) -> ()
    %15577 = "torch_c.to_builtin_tensor"(%15566) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>) -> tensor<4x32x1x128xf8E4M3FNUZ>
    %15578 = "tensor.cast"(%15577) : (tensor<4x32x1x128xf8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %15579 = "torch_c.to_builtin_tensor"(%15569) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %15580 = "torch_c.to_builtin_tensor"(%15572) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %15581 = "torch_c.to_builtin_tensor"(%15576) : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>) -> tensor<4x1x1x?xf8E4M3FNUZ>
    %15582 = "tensor.cast"(%15581) : (tensor<4x1x1x?xf8E4M3FNUZ>) -> tensor<?x?x?x?xf8E4M3FNUZ>
    %15583 = "torch_c.to_builtin_tensor"(%1328) : (!torch.vtensor<[],f32>) -> tensor<f32>
    %15584 = "util.call"(%15578, %15579, %15580, %15583, %15582) <{callee = @sharktank_masked_flash_attention_4_32_128_128_f8E4M3FNUZ_f32_f32}> : (tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<f32>, tensor<?x?x?x?xf8E4M3FNUZ>) -> tensor<4x32x?x128xf32>
    %15585 = "tensor.cast"(%15584) : (tensor<4x32x?x128xf32>) -> tensor<4x32x1x128xf32>
    %15586 = "torch_c.from_builtin_tensor"(%15585) : (tensor<4x32x1x128xf32>) -> !torch.vtensor<[4,32,1,128],f32>
    %15587 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %15588 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %15589 = "torch.aten.transpose.int"(%15586, %15587, %15588) : (!torch.vtensor<[4,32,1,128],f32>, !torch.int, !torch.int) -> !torch.vtensor<[4,1,32,128],f32>
    %15590 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %15591 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %15592 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %15593 = "torch.prim.ListConstruct"(%15590, %15591, %15592) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %15594 = "torch.aten.view"(%15589, %15593) : (!torch.vtensor<[4,1,32,128],f32>, !torch.list<int>) -> !torch.vtensor<[4,1,4096],f32>
    %15595 = "torch.aten.div.Tensor"(%15594, %1330) : (!torch.vtensor<[4,1,4096],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],f32>
    %15596 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %15597 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %15598 = "torch.aten.clamp"(%15595, %15596, %15597) : (!torch.vtensor<[4,1,4096],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],f32>
    %15599 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %15600 = "torch.prims.convert_element_type"(%15598, %15599) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %15601 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %15602 = "torch.aten.unsqueeze"(%1332, %15601) : (!torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>
    %15603 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %15604 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %15605 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %15606 = "torch.prim.ListConstruct"(%15603, %15604, %15605) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %15607 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %15608 = "torch.aten.expand"(%15602, %15606, %15607) : (!torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ>
    %15609 = "torch_c.to_builtin_tensor"(%15600) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %15610 = "torch_c.to_builtin_tensor"(%15608) : (!torch.vtensor<[4,4096,4096],f8E4M3FNUZ>) -> tensor<4x4096x4096xf8E4M3FNUZ>
    %15611 = "util.call"(%15609, %15610) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x1x4096xf32>
    %15612 = "torch_c.from_builtin_tensor"(%15611) : (tensor<4x1x4096xf32>) -> !torch.vtensor<[4,1,4096],f32>
    %15613 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %15614 = "torch.prims.convert_element_type"(%15612, %15613) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %15615 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %15616 = "torch.aten.add.Tensor"(%15215, %15614, %15615) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %15617 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %15618 = "torch.prims.convert_element_type"(%15616, %15617) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f32>
    %15619 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %15620 = "torch.aten.pow.Tensor_Scalar"(%15618, %15619) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],f32>
    %15621 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %15622 = "torch.prim.ListConstruct"(%15621) : (!torch.int) -> !torch.list<int>
    %15623 = "torch.constant.bool"() <{value = true}> : () -> !torch.bool
    %15624 = "torch.constant.none"() : () -> !torch.none
    %15625 = "torch.aten.mean.dim"(%15620, %15622, %15623, %15624) : (!torch.vtensor<[4,1,4096],f32>, !torch.list<int>, !torch.bool, !torch.none) -> !torch.vtensor<[4,1,1],f32>
    %15626 = "torch.constant.float"() <{value = 1.000000e-05 : f64}> : () -> !torch.float
    %15627 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %15628 = "torch.aten.add.Scalar"(%15625, %15626, %15627) : (!torch.vtensor<[4,1,1],f32>, !torch.float, !torch.int) -> !torch.vtensor<[4,1,1],f32>
    %15629 = "torch.aten.rsqrt"(%15628) : (!torch.vtensor<[4,1,1],f32>) -> !torch.vtensor<[4,1,1],f32>
    %15630 = "torch.aten.mul.Tensor"(%15618, %15629) : (!torch.vtensor<[4,1,4096],f32>, !torch.vtensor<[4,1,1],f32>) -> !torch.vtensor<[4,1,4096],f32>
    %15631 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %15632 = "torch.prims.convert_element_type"(%15630, %15631) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %15633 = "torch.aten.mul.Tensor"(%1334, %15632) : (!torch.vtensor<[4096],bf16>, !torch.vtensor<[4,1,4096],bf16>) -> !torch.vtensor<[4,1,4096],bf16>
    %15634 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %15635 = "torch.prims.convert_element_type"(%15633, %15634) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %15636 = "torch.aten.div.Tensor"(%15635, %1336) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %15637 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %15638 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %15639 = "torch.aten.clamp"(%15636, %15637, %15638) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %15640 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %15641 = "torch.prims.convert_element_type"(%15639, %15640) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %15642 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %15643 = "torch.aten.unsqueeze"(%1338, %15642) : (!torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>
    %15644 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %15645 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %15646 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %15647 = "torch.prim.ListConstruct"(%15644, %15645, %15646) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %15648 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %15649 = "torch.aten.expand"(%15643, %15647, %15648) : (!torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ>
    %15650 = "torch_c.to_builtin_tensor"(%15641) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %15651 = "torch_c.to_builtin_tensor"(%15649) : (!torch.vtensor<[4,14336,4096],f8E4M3FNUZ>) -> tensor<4x14336x4096xf8E4M3FNUZ>
    %15652 = "util.call"(%15650, %15651) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x1x14336xf32>
    %15653 = "torch_c.from_builtin_tensor"(%15652) : (tensor<4x1x14336xf32>) -> !torch.vtensor<[4,1,14336],f32>
    %15654 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %15655 = "torch.prims.convert_element_type"(%15653, %15654) : (!torch.vtensor<[4,1,14336],f32>, !torch.int) -> !torch.vtensor<[4,1,14336],bf16>
    %15656 = "torch.aten.silu"(%15655) : (!torch.vtensor<[4,1,14336],bf16>) -> !torch.vtensor<[4,1,14336],bf16>
    %15657 = "torch.aten.div.Tensor"(%15635, %1340) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %15658 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %15659 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %15660 = "torch.aten.clamp"(%15657, %15658, %15659) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %15661 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %15662 = "torch.prims.convert_element_type"(%15660, %15661) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %15663 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %15664 = "torch.aten.unsqueeze"(%1342, %15663) : (!torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>
    %15665 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %15666 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %15667 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %15668 = "torch.prim.ListConstruct"(%15665, %15666, %15667) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %15669 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %15670 = "torch.aten.expand"(%15664, %15668, %15669) : (!torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ>
    %15671 = "torch_c.to_builtin_tensor"(%15662) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %15672 = "torch_c.to_builtin_tensor"(%15670) : (!torch.vtensor<[4,14336,4096],f8E4M3FNUZ>) -> tensor<4x14336x4096xf8E4M3FNUZ>
    %15673 = "util.call"(%15671, %15672) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x1x14336xf32>
    %15674 = "torch_c.from_builtin_tensor"(%15673) : (tensor<4x1x14336xf32>) -> !torch.vtensor<[4,1,14336],f32>
    %15675 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %15676 = "torch.prims.convert_element_type"(%15674, %15675) : (!torch.vtensor<[4,1,14336],f32>, !torch.int) -> !torch.vtensor<[4,1,14336],bf16>
    %15677 = "torch.aten.mul.Tensor"(%15656, %15676) : (!torch.vtensor<[4,1,14336],bf16>, !torch.vtensor<[4,1,14336],bf16>) -> !torch.vtensor<[4,1,14336],bf16>
    %15678 = "torch.aten.div.Tensor"(%15677, %1344) : (!torch.vtensor<[4,1,14336],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,14336],bf16>
    %15679 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %15680 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %15681 = "torch.aten.clamp"(%15678, %15679, %15680) : (!torch.vtensor<[4,1,14336],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,14336],bf16>
    %15682 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %15683 = "torch.prims.convert_element_type"(%15681, %15682) : (!torch.vtensor<[4,1,14336],bf16>, !torch.int) -> !torch.vtensor<[4,1,14336],f8E4M3FNUZ>
    %15684 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %15685 = "torch.aten.unsqueeze"(%1346, %15684) : (!torch.vtensor<[4096,14336],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,14336],f8E4M3FNUZ>
    %15686 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %15687 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %15688 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %15689 = "torch.prim.ListConstruct"(%15686, %15687, %15688) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %15690 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %15691 = "torch.aten.expand"(%15685, %15689, %15690) : (!torch.vtensor<[1,4096,14336],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,14336],f8E4M3FNUZ>
    %15692 = "torch_c.to_builtin_tensor"(%15683) : (!torch.vtensor<[4,1,14336],f8E4M3FNUZ>) -> tensor<4x1x14336xf8E4M3FNUZ>
    %15693 = "torch_c.to_builtin_tensor"(%15691) : (!torch.vtensor<[4,4096,14336],f8E4M3FNUZ>) -> tensor<4x4096x14336xf8E4M3FNUZ>
    %15694 = "util.call"(%15692, %15693) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x14336xf8E4M3FNUZ_R4x4096x14336xf8E4M3FNUZ}> : (tensor<4x1x14336xf8E4M3FNUZ>, tensor<4x4096x14336xf8E4M3FNUZ>) -> tensor<4x1x4096xf32>
    %15695 = "torch_c.from_builtin_tensor"(%15694) : (tensor<4x1x4096xf32>) -> !torch.vtensor<[4,1,4096],f32>
    %15696 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %15697 = "torch.prims.convert_element_type"(%15695, %15696) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %15698 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %15699 = "torch.aten.add.Tensor"(%15616, %15697, %15698) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %15700 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %15701 = "torch.prims.convert_element_type"(%15699, %15700) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f32>
    %15702 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %15703 = "torch.aten.pow.Tensor_Scalar"(%15701, %15702) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],f32>
    %15704 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %15705 = "torch.prim.ListConstruct"(%15704) : (!torch.int) -> !torch.list<int>
    %15706 = "torch.constant.bool"() <{value = true}> : () -> !torch.bool
    %15707 = "torch.constant.none"() : () -> !torch.none
    %15708 = "torch.aten.mean.dim"(%15703, %15705, %15706, %15707) : (!torch.vtensor<[4,1,4096],f32>, !torch.list<int>, !torch.bool, !torch.none) -> !torch.vtensor<[4,1,1],f32>
    %15709 = "torch.constant.float"() <{value = 1.000000e-05 : f64}> : () -> !torch.float
    %15710 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %15711 = "torch.aten.add.Scalar"(%15708, %15709, %15710) : (!torch.vtensor<[4,1,1],f32>, !torch.float, !torch.int) -> !torch.vtensor<[4,1,1],f32>
    %15712 = "torch.aten.rsqrt"(%15711) : (!torch.vtensor<[4,1,1],f32>) -> !torch.vtensor<[4,1,1],f32>
    %15713 = "torch.aten.mul.Tensor"(%15701, %15712) : (!torch.vtensor<[4,1,4096],f32>, !torch.vtensor<[4,1,1],f32>) -> !torch.vtensor<[4,1,4096],f32>
    %15714 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %15715 = "torch.prims.convert_element_type"(%15713, %15714) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %15716 = "torch.aten.mul.Tensor"(%1348, %15715) : (!torch.vtensor<[4096],bf16>, !torch.vtensor<[4,1,4096],bf16>) -> !torch.vtensor<[4,1,4096],bf16>
    %15717 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %15718 = "torch.prims.convert_element_type"(%15716, %15717) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %15719 = "torch.aten.div.Tensor"(%15718, %1350) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %15720 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %15721 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %15722 = "torch.aten.clamp"(%15719, %15720, %15721) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %15723 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %15724 = "torch.prims.convert_element_type"(%15722, %15723) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %15725 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %15726 = "torch.aten.unsqueeze"(%1352, %15725) : (!torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>
    %15727 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %15728 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %15729 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %15730 = "torch.prim.ListConstruct"(%15727, %15728, %15729) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %15731 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %15732 = "torch.aten.expand"(%15726, %15730, %15731) : (!torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ>
    %15733 = "torch_c.to_builtin_tensor"(%15724) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %15734 = "torch_c.to_builtin_tensor"(%15732) : (!torch.vtensor<[4,4096,4096],f8E4M3FNUZ>) -> tensor<4x4096x4096xf8E4M3FNUZ>
    %15735 = "util.call"(%15733, %15734) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x1x4096xf32>
    %15736 = "torch_c.from_builtin_tensor"(%15735) : (tensor<4x1x4096xf32>) -> !torch.vtensor<[4,1,4096],f32>
    %15737 = "torch.aten.div.Tensor"(%15736, %1354) : (!torch.vtensor<[4,1,4096],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],f32>
    %15738 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %15739 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %15740 = "torch.aten.clamp"(%15737, %15738, %15739) : (!torch.vtensor<[4,1,4096],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],f32>
    %15741 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %15742 = "torch.prims.convert_element_type"(%15740, %15741) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %15743 = "torch.aten.div.Tensor"(%15718, %1356) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %15744 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %15745 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %15746 = "torch.aten.clamp"(%15743, %15744, %15745) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %15747 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %15748 = "torch.prims.convert_element_type"(%15746, %15747) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %15749 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %15750 = "torch.aten.unsqueeze"(%1358, %15749) : (!torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>
    %15751 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %15752 = "torch.constant.int"() <{value = 1024 : i64}> : () -> !torch.int
    %15753 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %15754 = "torch.prim.ListConstruct"(%15751, %15752, %15753) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %15755 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %15756 = "torch.aten.expand"(%15750, %15754, %15755) : (!torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ>
    %15757 = "torch_c.to_builtin_tensor"(%15748) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %15758 = "torch_c.to_builtin_tensor"(%15756) : (!torch.vtensor<[4,1024,4096],f8E4M3FNUZ>) -> tensor<4x1024x4096xf8E4M3FNUZ>
    %15759 = "util.call"(%15757, %15758) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x1x1024xf32>
    %15760 = "torch_c.from_builtin_tensor"(%15759) : (tensor<4x1x1024xf32>) -> !torch.vtensor<[4,1,1024],f32>
    %15761 = "torch.aten.div.Tensor"(%15760, %1360) : (!torch.vtensor<[4,1,1024],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,1024],f32>
    %15762 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %15763 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %15764 = "torch.aten.clamp"(%15761, %15762, %15763) : (!torch.vtensor<[4,1,1024],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,1024],f32>
    %15765 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %15766 = "torch.prims.convert_element_type"(%15764, %15765) : (!torch.vtensor<[4,1,1024],f32>, !torch.int) -> !torch.vtensor<[4,1,1024],f8E4M3FNUZ>
    %15767 = "torch.aten.div.Tensor"(%15718, %1362) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %15768 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %15769 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %15770 = "torch.aten.clamp"(%15767, %15768, %15769) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %15771 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %15772 = "torch.prims.convert_element_type"(%15770, %15771) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %15773 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %15774 = "torch.aten.unsqueeze"(%1364, %15773) : (!torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>
    %15775 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %15776 = "torch.constant.int"() <{value = 1024 : i64}> : () -> !torch.int
    %15777 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %15778 = "torch.prim.ListConstruct"(%15775, %15776, %15777) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %15779 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %15780 = "torch.aten.expand"(%15774, %15778, %15779) : (!torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ>
    %15781 = "torch_c.to_builtin_tensor"(%15772) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %15782 = "torch_c.to_builtin_tensor"(%15780) : (!torch.vtensor<[4,1024,4096],f8E4M3FNUZ>) -> tensor<4x1024x4096xf8E4M3FNUZ>
    %15783 = "util.call"(%15781, %15782) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x1x1024xf32>
    %15784 = "torch_c.from_builtin_tensor"(%15783) : (tensor<4x1x1024xf32>) -> !torch.vtensor<[4,1,1024],f32>
    %15785 = "torch.aten.div.Tensor"(%15784, %1366) : (!torch.vtensor<[4,1,1024],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,1024],f32>
    %15786 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %15787 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %15788 = "torch.aten.clamp"(%15785, %15786, %15787) : (!torch.vtensor<[4,1,1024],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,1024],f32>
    %15789 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %15790 = "torch.prims.convert_element_type"(%15788, %15789) : (!torch.vtensor<[4,1,1024],f32>, !torch.int) -> !torch.vtensor<[4,1,1024],f8E4M3FNUZ>
    %15791 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %15792 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %15793 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %15794 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %15795 = "torch.prim.ListConstruct"(%15791, %15792, %15793, %15794) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %15796 = "torch.aten.view"(%15742, %15795) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,1,32,128],f8E4M3FNUZ>
    %15797 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %15798 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %15799 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %15800 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %15801 = "torch.prim.ListConstruct"(%15797, %15798, %15799, %15800) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %15802 = "torch.aten.view"(%15766, %15801) : (!torch.vtensor<[4,1,1024],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>
    %15803 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %15804 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %15805 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %15806 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %15807 = "torch.prim.ListConstruct"(%15803, %15804, %15805, %15806) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %15808 = "torch.aten.view"(%15790, %15807) : (!torch.vtensor<[4,1,1024],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>
    %15809 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %15810 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %15811 = "torch.aten.transpose.int"(%15796, %15809, %15810) : (!torch.vtensor<[4,1,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %15812 = "torch.aten.mul.Tensor"(%15811, %1637) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,1,1,128],bf16>) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %15813 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %15814 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %15815 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %15816 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %15817 = "torch.aten.slice.Tensor"(%15811, %15813, %15814, %15815, %15816) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,32,1,64],f8E4M3FNUZ>
    %15818 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %15819 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %15820 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %15821 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %15822 = "torch.aten.slice.Tensor"(%15811, %15818, %15819, %15820, %15821) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,32,1,64],f8E4M3FNUZ>
    %15823 = "torch.aten.neg"(%15822) : (!torch.vtensor<[4,32,1,64],f8E4M3FNUZ>) -> !torch.vtensor<[4,32,1,64],f8E4M3FNUZ>
    %15824 = "torch.prim.ListConstruct"(%15823, %15817) : (!torch.vtensor<[4,32,1,64],f8E4M3FNUZ>, !torch.vtensor<[4,32,1,64],f8E4M3FNUZ>) -> !torch.list<vtensor>
    %15825 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %15826 = "torch.aten.cat"(%15824, %15825) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %15827 = "torch.aten.mul.Tensor"(%15826, %1651) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,1,1,128],bf16>) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %15828 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %15829 = "torch.aten.add.Tensor"(%15812, %15827, %15828) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %15830 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %15831 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %15832 = "torch.aten.transpose.int"(%15829, %15830, %15831) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,1,32,128],f8E4M3FNUZ>
    %15833 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %15834 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %15835 = "torch.aten.transpose.int"(%15802, %15833, %15834) : (!torch.vtensor<[4,1,8,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %15836 = "torch.aten.mul.Tensor"(%15835, %1637) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,1,1,128],bf16>) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %15837 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %15838 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %15839 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %15840 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %15841 = "torch.aten.slice.Tensor"(%15835, %15837, %15838, %15839, %15840) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,8,1,64],f8E4M3FNUZ>
    %15842 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %15843 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %15844 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %15845 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %15846 = "torch.aten.slice.Tensor"(%15835, %15842, %15843, %15844, %15845) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,8,1,64],f8E4M3FNUZ>
    %15847 = "torch.aten.neg"(%15846) : (!torch.vtensor<[4,8,1,64],f8E4M3FNUZ>) -> !torch.vtensor<[4,8,1,64],f8E4M3FNUZ>
    %15848 = "torch.prim.ListConstruct"(%15847, %15841) : (!torch.vtensor<[4,8,1,64],f8E4M3FNUZ>, !torch.vtensor<[4,8,1,64],f8E4M3FNUZ>) -> !torch.list<vtensor>
    %15849 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %15850 = "torch.aten.cat"(%15848, %15849) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %15851 = "torch.aten.mul.Tensor"(%15850, %1651) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,1,1,128],bf16>) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %15852 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %15853 = "torch.aten.add.Tensor"(%15836, %15851, %15852) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %15854 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %15855 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %15856 = "torch.aten.transpose.int"(%15853, %15854, %15855) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>
    %15857 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %15858 = "torch.aten.floor_divide.Scalar"(%arg64, %15857) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4],si64>
    %15859 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %15860 = "torch.aten.unsqueeze"(%15858, %15859) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %15861 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %15862 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %15863 = "torch.aten.gather"(%arg65, %15861, %15860, %15862) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.vtensor<[4,1],si64>, !torch.bool) -> !torch.vtensor<[4,1],si64>
    %15864 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %15865 = "torch.aten.remainder.Scalar"(%arg64, %15864) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4],si64>
    %15866 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %15867 = "torch.aten.unsqueeze"(%15865, %15866) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %15868 = "torch.constant.none"() : () -> !torch.none
    %15869 = "torch.aten.clone"(%1367, %15868) : (!torch.vtensor<[],si64>, !torch.none) -> !torch.vtensor<[],si64>
    %15870 = "torch.aten.detach"(%15869) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %15871 = "torch.aten.detach"(%15870) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %15872 = "torch.aten.detach"(%15871) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %15873 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %15874 = "torch.aten.unsqueeze"(%15872, %15873) : (!torch.vtensor<[],si64>, !torch.int) -> !torch.vtensor<[1],si64>
    %15875 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %15876 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %15877 = "torch.prim.ListConstruct"(%15875, %15876) : (!torch.int, !torch.int) -> !torch.list<int>
    %15878 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %15879 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %15880 = "torch.prim.ListConstruct"(%15878, %15879) : (!torch.int, !torch.int) -> !torch.list<int>
    %15881 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %15882 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %15883 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %15884 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %15885 = "torch.aten.empty_strided"(%15877, %15880, %15881, %15882, %15883, %15884) : (!torch.list<int>, !torch.list<int>, !torch.int, !torch.int, !torch.Device, !torch.bool) -> !torch.vtensor<[4,1],si64>
    %15886 = "torch.constant.int"() <{value = 29 : i64}> : () -> !torch.int
    %15887 = "torch.aten.fill.Scalar"(%15885, %15886) : (!torch.vtensor<[4,1],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %15888 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %15889 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %15890 = "torch.prim.ListConstruct"(%15888, %15889) : (!torch.int, !torch.int) -> !torch.list<int>
    %15891 = "torch.aten.repeat"(%15874, %15890) : (!torch.vtensor<[1],si64>, !torch.list<int>) -> !torch.vtensor<[4,1],si64>
    %15892 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %15893 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %15894 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %15895 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %15896 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %15897 = "torch.prim.ListConstruct"(%1483, %15892, %15893, %15894, %15895, %15896) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %15898 = "torch.aten.view"(%15468, %15897) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%15898, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %15899 = "torch.prim.ListConstruct"(%15863, %15887, %15891, %15867) : (!torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>) -> !torch.list<optional<vtensor>>
    %15900 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %15901 = "torch.aten.index_put"(%15898, %15899, %15856, %15900) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>, !torch.bool) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%15901, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %15902 = "torch.constant.int"() <{value = 2097152 : i64}> : () -> !torch.int
    %15903 = "torch.prim.ListConstruct"(%1483, %15902) : (!torch.int, !torch.int) -> !torch.list<int>
    %15904 = "torch.aten.view"(%15901, %15903) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,2097152],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%15904, %1479) <{shape_expressions = #map2}> : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> ()
    %15905 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %15906 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %15907 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %15908 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %15909 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %15910 = "torch.prim.ListConstruct"(%1483, %15905, %15906, %15907, %15908, %15909) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %15911 = "torch.aten.view"(%15904, %15910) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%15911, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %15912 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %15913 = "torch.aten.floor_divide.Scalar"(%arg64, %15912) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4],si64>
    %15914 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %15915 = "torch.aten.unsqueeze"(%15913, %15914) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %15916 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %15917 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %15918 = "torch.aten.gather"(%arg65, %15916, %15915, %15917) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.vtensor<[4,1],si64>, !torch.bool) -> !torch.vtensor<[4,1],si64>
    %15919 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %15920 = "torch.aten.remainder.Scalar"(%arg64, %15919) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4],si64>
    %15921 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %15922 = "torch.aten.unsqueeze"(%15920, %15921) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %15923 = "torch.constant.none"() : () -> !torch.none
    %15924 = "torch.aten.clone"(%1368, %15923) : (!torch.vtensor<[],si64>, !torch.none) -> !torch.vtensor<[],si64>
    %15925 = "torch.aten.detach"(%15924) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %15926 = "torch.aten.detach"(%15925) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %15927 = "torch.aten.detach"(%15926) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %15928 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %15929 = "torch.aten.unsqueeze"(%15927, %15928) : (!torch.vtensor<[],si64>, !torch.int) -> !torch.vtensor<[1],si64>
    %15930 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %15931 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %15932 = "torch.prim.ListConstruct"(%15930, %15931) : (!torch.int, !torch.int) -> !torch.list<int>
    %15933 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %15934 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %15935 = "torch.prim.ListConstruct"(%15933, %15934) : (!torch.int, !torch.int) -> !torch.list<int>
    %15936 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %15937 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %15938 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %15939 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %15940 = "torch.aten.empty_strided"(%15932, %15935, %15936, %15937, %15938, %15939) : (!torch.list<int>, !torch.list<int>, !torch.int, !torch.int, !torch.Device, !torch.bool) -> !torch.vtensor<[4,1],si64>
    %15941 = "torch.constant.int"() <{value = 29 : i64}> : () -> !torch.int
    %15942 = "torch.aten.fill.Scalar"(%15940, %15941) : (!torch.vtensor<[4,1],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %15943 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %15944 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %15945 = "torch.prim.ListConstruct"(%15943, %15944) : (!torch.int, !torch.int) -> !torch.list<int>
    %15946 = "torch.aten.repeat"(%15929, %15945) : (!torch.vtensor<[1],si64>, !torch.list<int>) -> !torch.vtensor<[4,1],si64>
    %15947 = "torch.prim.ListConstruct"(%15918, %15942, %15946, %15922) : (!torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>) -> !torch.list<optional<vtensor>>
    %15948 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %15949 = "torch.aten.index_put"(%15911, %15947, %15808, %15948) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>, !torch.bool) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%15949, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %15950 = "torch.constant.int"() <{value = 2097152 : i64}> : () -> !torch.int
    %15951 = "torch.prim.ListConstruct"(%1483, %15950) : (!torch.int, !torch.int) -> !torch.list<int>
    %15952 = "torch.aten.view"(%15949, %15951) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,2097152],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%15952, %1479) <{shape_expressions = #map2}> : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> ()
    %15953 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %15954 = "torch.aten.mul.Scalar"(%arg65, %15953) : (!torch.vtensor<[4,?],si64>, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%15954, %1478) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %15955 = "torch.constant.int"() <{value = 58 : i64}> : () -> !torch.int
    %15956 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %15957 = "torch.aten.add.Scalar"(%15954, %15955, %15956) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%15957, %1478) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %15958 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %15959 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %15960 = "torch.aten.add.Scalar"(%15957, %15958, %15959) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%15960, %1478) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %15961 = "torch.prim.ListConstruct"(%1924) : (!torch.int) -> !torch.list<int>
    %15962 = "torch.aten.view"(%15960, %15961) : (!torch.vtensor<[4,?],si64>, !torch.list<int>) -> !torch.vtensor<[?],si64>
    "torch.bind_symbolic_shape"(%15962, %1478) <{shape_expressions = #map24}> : (!torch.vtensor<[?],si64>, !torch.int) -> ()
    %15963 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %15964 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %15965 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %15966 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %15967 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %15968 = "torch.prim.ListConstruct"(%1483, %15963, %15964, %15965, %15966, %15967) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %15969 = "torch.aten.view"(%15952, %15968) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%15969, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %15970 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %15971 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %15972 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %15973 = "torch.prim.ListConstruct"(%1914, %15970, %15971, %15972) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %15974 = "torch.aten.view"(%15969, %15973) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%15974, %1479) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %15975 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %15976 = "torch.aten.index_select"(%15974, %15975, %15962) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int, !torch.vtensor<[?],si64>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%15976, %1478) <{shape_expressions = #map23}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %15977 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %15978 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %15979 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %15980 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %15981 = "torch.prim.ListConstruct"(%15977, %1481, %15978, %15979, %15980) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %15982 = "torch.aten.view"(%15976, %15981) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%15982, %1478) <{shape_expressions = #map22}> : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %15983 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %15984 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %15985 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %15986 = "torch.prim.ListConstruct"(%15983, %1485, %15984, %15985) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %15987 = "torch.aten.view"(%15982, %15986) : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%15987, %1478) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %15988 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %15989 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %15990 = "torch.aten.add.Scalar"(%15957, %15988, %15989) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%15990, %1478) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %15991 = "torch.prim.ListConstruct"(%1924) : (!torch.int) -> !torch.list<int>
    %15992 = "torch.aten.view"(%15990, %15991) : (!torch.vtensor<[4,?],si64>, !torch.list<int>) -> !torch.vtensor<[?],si64>
    "torch.bind_symbolic_shape"(%15992, %1478) <{shape_expressions = #map24}> : (!torch.vtensor<[?],si64>, !torch.int) -> ()
    %15993 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %15994 = "torch.aten.index_select"(%15974, %15993, %15992) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int, !torch.vtensor<[?],si64>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%15994, %1478) <{shape_expressions = #map23}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %15995 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %15996 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %15997 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %15998 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %15999 = "torch.prim.ListConstruct"(%15995, %1481, %15996, %15997, %15998) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %16000 = "torch.aten.view"(%15994, %15999) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%16000, %1478) <{shape_expressions = #map22}> : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %16001 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %16002 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %16003 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %16004 = "torch.prim.ListConstruct"(%16001, %1485, %16002, %16003) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %16005 = "torch.aten.view"(%16000, %16004) : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%16005, %1478) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %16006 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %16007 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %16008 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %16009 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %16010 = "torch.aten.slice.Tensor"(%15987, %16006, %16007, %16008, %16009) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%16010, %1478) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %16011 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %16012 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %16013 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %16014 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %16015 = "torch.aten.slice.Tensor"(%16005, %16011, %16012, %16013, %16014) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%16015, %1478) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %16016 = "torch.constant.int"() <{value = -2 : i64}> : () -> !torch.int
    %16017 = "torch.aten.unsqueeze"(%16010, %16016) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%16017, %1478) <{shape_expressions = #map25}> : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.int) -> ()
    %16018 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %16019 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %16020 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %16021 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %16022 = "torch.prim.ListConstruct"(%16018, %1485, %16019, %16020, %16021) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %16023 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %16024 = "torch.aten.expand"(%16017, %16022, %16023) : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%16024, %1478) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %16025 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %16026 = "torch.aten.clone"(%16024, %16025) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%16026, %1478) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %16027 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %16028 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %16029 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %16030 = "torch.prim.ListConstruct"(%16027, %1485, %16028, %16029) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %16031 = "torch.aten._unsafe_view"(%16026, %16030) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%16031, %1478) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %16032 = "torch.constant.int"() <{value = -2 : i64}> : () -> !torch.int
    %16033 = "torch.aten.unsqueeze"(%16015, %16032) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%16033, %1478) <{shape_expressions = #map25}> : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.int) -> ()
    %16034 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %16035 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %16036 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %16037 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %16038 = "torch.prim.ListConstruct"(%16034, %1485, %16035, %16036, %16037) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %16039 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %16040 = "torch.aten.expand"(%16033, %16038, %16039) : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%16040, %1478) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %16041 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %16042 = "torch.aten.clone"(%16040, %16041) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%16042, %1478) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %16043 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %16044 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %16045 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %16046 = "torch.prim.ListConstruct"(%16043, %1485, %16044, %16045) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %16047 = "torch.aten._unsafe_view"(%16042, %16046) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%16047, %1478) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %16048 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %16049 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %16050 = "torch.aten.transpose.int"(%15832, %16048, %16049) : (!torch.vtensor<[4,1,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %16051 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %16052 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %16053 = "torch.aten.transpose.int"(%16031, %16051, %16052) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%16053, %1478) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %16054 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %16055 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %16056 = "torch.aten.transpose.int"(%16047, %16054, %16055) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%16056, %1478) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %16057 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %16058 = "torch.aten.squeeze.dim"(%1516, %16057) : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,1,1,?],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%16058, %1478) <{shape_expressions = #map7}> : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>, !torch.int) -> ()
    %16059 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %16060 = "torch.aten.squeeze.dim"(%16058, %16059) : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,1,1,?],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%16060, %1478) <{shape_expressions = #map7}> : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>, !torch.int) -> ()
    %16061 = "torch_c.to_builtin_tensor"(%16050) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>) -> tensor<4x32x1x128xf8E4M3FNUZ>
    %16062 = "tensor.cast"(%16061) : (tensor<4x32x1x128xf8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %16063 = "torch_c.to_builtin_tensor"(%16053) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %16064 = "torch_c.to_builtin_tensor"(%16056) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %16065 = "torch_c.to_builtin_tensor"(%16060) : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>) -> tensor<4x1x1x?xf8E4M3FNUZ>
    %16066 = "tensor.cast"(%16065) : (tensor<4x1x1x?xf8E4M3FNUZ>) -> tensor<?x?x?x?xf8E4M3FNUZ>
    %16067 = "torch_c.to_builtin_tensor"(%1370) : (!torch.vtensor<[],f32>) -> tensor<f32>
    %16068 = "util.call"(%16062, %16063, %16064, %16067, %16066) <{callee = @sharktank_masked_flash_attention_4_32_128_128_f8E4M3FNUZ_f32_f32}> : (tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<f32>, tensor<?x?x?x?xf8E4M3FNUZ>) -> tensor<4x32x?x128xf32>
    %16069 = "tensor.cast"(%16068) : (tensor<4x32x?x128xf32>) -> tensor<4x32x1x128xf32>
    %16070 = "torch_c.from_builtin_tensor"(%16069) : (tensor<4x32x1x128xf32>) -> !torch.vtensor<[4,32,1,128],f32>
    %16071 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %16072 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %16073 = "torch.aten.transpose.int"(%16070, %16071, %16072) : (!torch.vtensor<[4,32,1,128],f32>, !torch.int, !torch.int) -> !torch.vtensor<[4,1,32,128],f32>
    %16074 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %16075 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %16076 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %16077 = "torch.prim.ListConstruct"(%16074, %16075, %16076) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %16078 = "torch.aten.view"(%16073, %16077) : (!torch.vtensor<[4,1,32,128],f32>, !torch.list<int>) -> !torch.vtensor<[4,1,4096],f32>
    %16079 = "torch.aten.div.Tensor"(%16078, %1372) : (!torch.vtensor<[4,1,4096],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],f32>
    %16080 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %16081 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %16082 = "torch.aten.clamp"(%16079, %16080, %16081) : (!torch.vtensor<[4,1,4096],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],f32>
    %16083 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %16084 = "torch.prims.convert_element_type"(%16082, %16083) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %16085 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %16086 = "torch.aten.unsqueeze"(%1374, %16085) : (!torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>
    %16087 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %16088 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %16089 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %16090 = "torch.prim.ListConstruct"(%16087, %16088, %16089) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %16091 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %16092 = "torch.aten.expand"(%16086, %16090, %16091) : (!torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ>
    %16093 = "torch_c.to_builtin_tensor"(%16084) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %16094 = "torch_c.to_builtin_tensor"(%16092) : (!torch.vtensor<[4,4096,4096],f8E4M3FNUZ>) -> tensor<4x4096x4096xf8E4M3FNUZ>
    %16095 = "util.call"(%16093, %16094) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x1x4096xf32>
    %16096 = "torch_c.from_builtin_tensor"(%16095) : (tensor<4x1x4096xf32>) -> !torch.vtensor<[4,1,4096],f32>
    %16097 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %16098 = "torch.prims.convert_element_type"(%16096, %16097) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %16099 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %16100 = "torch.aten.add.Tensor"(%15699, %16098, %16099) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %16101 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %16102 = "torch.prims.convert_element_type"(%16100, %16101) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f32>
    %16103 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %16104 = "torch.aten.pow.Tensor_Scalar"(%16102, %16103) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],f32>
    %16105 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %16106 = "torch.prim.ListConstruct"(%16105) : (!torch.int) -> !torch.list<int>
    %16107 = "torch.constant.bool"() <{value = true}> : () -> !torch.bool
    %16108 = "torch.constant.none"() : () -> !torch.none
    %16109 = "torch.aten.mean.dim"(%16104, %16106, %16107, %16108) : (!torch.vtensor<[4,1,4096],f32>, !torch.list<int>, !torch.bool, !torch.none) -> !torch.vtensor<[4,1,1],f32>
    %16110 = "torch.constant.float"() <{value = 1.000000e-05 : f64}> : () -> !torch.float
    %16111 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %16112 = "torch.aten.add.Scalar"(%16109, %16110, %16111) : (!torch.vtensor<[4,1,1],f32>, !torch.float, !torch.int) -> !torch.vtensor<[4,1,1],f32>
    %16113 = "torch.aten.rsqrt"(%16112) : (!torch.vtensor<[4,1,1],f32>) -> !torch.vtensor<[4,1,1],f32>
    %16114 = "torch.aten.mul.Tensor"(%16102, %16113) : (!torch.vtensor<[4,1,4096],f32>, !torch.vtensor<[4,1,1],f32>) -> !torch.vtensor<[4,1,4096],f32>
    %16115 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %16116 = "torch.prims.convert_element_type"(%16114, %16115) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %16117 = "torch.aten.mul.Tensor"(%1376, %16116) : (!torch.vtensor<[4096],bf16>, !torch.vtensor<[4,1,4096],bf16>) -> !torch.vtensor<[4,1,4096],bf16>
    %16118 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %16119 = "torch.prims.convert_element_type"(%16117, %16118) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %16120 = "torch.aten.div.Tensor"(%16119, %1378) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %16121 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %16122 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %16123 = "torch.aten.clamp"(%16120, %16121, %16122) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %16124 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %16125 = "torch.prims.convert_element_type"(%16123, %16124) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %16126 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %16127 = "torch.aten.unsqueeze"(%1380, %16126) : (!torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>
    %16128 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %16129 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %16130 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %16131 = "torch.prim.ListConstruct"(%16128, %16129, %16130) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %16132 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %16133 = "torch.aten.expand"(%16127, %16131, %16132) : (!torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ>
    %16134 = "torch_c.to_builtin_tensor"(%16125) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %16135 = "torch_c.to_builtin_tensor"(%16133) : (!torch.vtensor<[4,14336,4096],f8E4M3FNUZ>) -> tensor<4x14336x4096xf8E4M3FNUZ>
    %16136 = "util.call"(%16134, %16135) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x1x14336xf32>
    %16137 = "torch_c.from_builtin_tensor"(%16136) : (tensor<4x1x14336xf32>) -> !torch.vtensor<[4,1,14336],f32>
    %16138 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %16139 = "torch.prims.convert_element_type"(%16137, %16138) : (!torch.vtensor<[4,1,14336],f32>, !torch.int) -> !torch.vtensor<[4,1,14336],bf16>
    %16140 = "torch.aten.silu"(%16139) : (!torch.vtensor<[4,1,14336],bf16>) -> !torch.vtensor<[4,1,14336],bf16>
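    // Up-projection branch (interpretation): the same quantize -> batch matmul
    // (4096 -> 14336) applied to the normed activations, then an elementwise
    // multiply with the SiLU-gated branch above, i.e. the standard SwiGLU form:
    //   hidden = silu(W_gate @ x) * (W_up @ x)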
    %16141 = "torch.aten.div.Tensor"(%16119, %1382) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %16142 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %16143 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %16144 = "torch.aten.clamp"(%16141, %16142, %16143) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %16145 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %16146 = "torch.prims.convert_element_type"(%16144, %16145) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %16147 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %16148 = "torch.aten.unsqueeze"(%1384, %16147) : (!torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>
    %16149 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %16150 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %16151 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %16152 = "torch.prim.ListConstruct"(%16149, %16150, %16151) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %16153 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %16154 = "torch.aten.expand"(%16148, %16152, %16153) : (!torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ>
    %16155 = "torch_c.to_builtin_tensor"(%16146) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %16156 = "torch_c.to_builtin_tensor"(%16154) : (!torch.vtensor<[4,14336,4096],f8E4M3FNUZ>) -> tensor<4x14336x4096xf8E4M3FNUZ>
    %16157 = "util.call"(%16155, %16156) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x1x14336xf32>
    %16158 = "torch_c.from_builtin_tensor"(%16157) : (tensor<4x1x14336xf32>) -> !torch.vtensor<[4,1,14336],f32>
    %16159 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %16160 = "torch.prims.convert_element_type"(%16158, %16159) : (!torch.vtensor<[4,1,14336],f32>, !torch.int) -> !torch.vtensor<[4,1,14336],bf16>
    %16161 = "torch.aten.mul.Tensor"(%16140, %16160) : (!torch.vtensor<[4,1,14336],bf16>, !torch.vtensor<[4,1,14336],bf16>) -> !torch.vtensor<[4,1,14336],bf16>
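    // Down-projection (interpretation): quantize the 14336-wide SwiGLU hidden
    // state to f8, project back to 4096, cast to bf16, and add the residual
    // from before the FFN, completing this transformer block.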
    %16162 = "torch.aten.div.Tensor"(%16161, %1386) : (!torch.vtensor<[4,1,14336],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,14336],bf16>
    %16163 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %16164 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %16165 = "torch.aten.clamp"(%16162, %16163, %16164) : (!torch.vtensor<[4,1,14336],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,14336],bf16>
    %16166 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %16167 = "torch.prims.convert_element_type"(%16165, %16166) : (!torch.vtensor<[4,1,14336],bf16>, !torch.int) -> !torch.vtensor<[4,1,14336],f8E4M3FNUZ>
    %16168 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %16169 = "torch.aten.unsqueeze"(%1388, %16168) : (!torch.vtensor<[4096,14336],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,14336],f8E4M3FNUZ>
    %16170 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %16171 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %16172 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %16173 = "torch.prim.ListConstruct"(%16170, %16171, %16172) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %16174 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %16175 = "torch.aten.expand"(%16169, %16173, %16174) : (!torch.vtensor<[1,4096,14336],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,14336],f8E4M3FNUZ>
    %16176 = "torch_c.to_builtin_tensor"(%16167) : (!torch.vtensor<[4,1,14336],f8E4M3FNUZ>) -> tensor<4x1x14336xf8E4M3FNUZ>
    %16177 = "torch_c.to_builtin_tensor"(%16175) : (!torch.vtensor<[4,4096,14336],f8E4M3FNUZ>) -> tensor<4x4096x14336xf8E4M3FNUZ>
    %16178 = "util.call"(%16176, %16177) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x14336xf8E4M3FNUZ_R4x4096x14336xf8E4M3FNUZ}> : (tensor<4x1x14336xf8E4M3FNUZ>, tensor<4x4096x14336xf8E4M3FNUZ>) -> tensor<4x1x4096xf32>
    %16179 = "torch_c.from_builtin_tensor"(%16178) : (tensor<4x1x4096xf32>) -> !torch.vtensor<[4,1,4096],f32>
    %16180 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %16181 = "torch.prims.convert_element_type"(%16179, %16180) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %16182 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %16183 = "torch.aten.add.Tensor"(%16100, %16181, %16182) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
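    // Next block (interpretation): attention-input RMSNorm over the new
    // residual stream, using the same mean-of-squares / rsqrt / weight-multiply
    // pattern annotated above.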
    %16184 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %16185 = "torch.prims.convert_element_type"(%16183, %16184) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f32>
    %16186 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %16187 = "torch.aten.pow.Tensor_Scalar"(%16185, %16186) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],f32>
    %16188 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %16189 = "torch.prim.ListConstruct"(%16188) : (!torch.int) -> !torch.list<int>
    %16190 = "torch.constant.bool"() <{value = true}> : () -> !torch.bool
    %16191 = "torch.constant.none"() : () -> !torch.none
    %16192 = "torch.aten.mean.dim"(%16187, %16189, %16190, %16191) : (!torch.vtensor<[4,1,4096],f32>, !torch.list<int>, !torch.bool, !torch.none) -> !torch.vtensor<[4,1,1],f32>
    %16193 = "torch.constant.float"() <{value = 1.000000e-05 : f64}> : () -> !torch.float
    %16194 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %16195 = "torch.aten.add.Scalar"(%16192, %16193, %16194) : (!torch.vtensor<[4,1,1],f32>, !torch.float, !torch.int) -> !torch.vtensor<[4,1,1],f32>
    %16196 = "torch.aten.rsqrt"(%16195) : (!torch.vtensor<[4,1,1],f32>) -> !torch.vtensor<[4,1,1],f32>
    %16197 = "torch.aten.mul.Tensor"(%16185, %16196) : (!torch.vtensor<[4,1,4096],f32>, !torch.vtensor<[4,1,1],f32>) -> !torch.vtensor<[4,1,4096],f32>
    %16198 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %16199 = "torch.prims.convert_element_type"(%16197, %16198) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %16200 = "torch.aten.mul.Tensor"(%1390, %16199) : (!torch.vtensor<[4096],bf16>, !torch.vtensor<[4,1,4096],bf16>) -> !torch.vtensor<[4,1,4096],bf16>
    %16201 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %16202 = "torch.prims.convert_element_type"(%16200, %16201) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
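    // Q/K/V projections (interpretation): three quantize + batch-matmul
    // sequences against 4096x4096 (Q) and 1024x4096 (K and V) fp8 weight
    // tensors; each matmul result is then divided by its own per-tensor scale,
    // clamped, and re-cast to f8E4M3FNUZ so the attention inputs stay in fp8.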
    %16203 = "torch.aten.div.Tensor"(%16202, %1392) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %16204 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %16205 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %16206 = "torch.aten.clamp"(%16203, %16204, %16205) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %16207 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %16208 = "torch.prims.convert_element_type"(%16206, %16207) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %16209 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %16210 = "torch.aten.unsqueeze"(%1394, %16209) : (!torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>
    %16211 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %16212 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %16213 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %16214 = "torch.prim.ListConstruct"(%16211, %16212, %16213) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %16215 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %16216 = "torch.aten.expand"(%16210, %16214, %16215) : (!torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ>
    %16217 = "torch_c.to_builtin_tensor"(%16208) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %16218 = "torch_c.to_builtin_tensor"(%16216) : (!torch.vtensor<[4,4096,4096],f8E4M3FNUZ>) -> tensor<4x4096x4096xf8E4M3FNUZ>
    %16219 = "util.call"(%16217, %16218) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x1x4096xf32>
    %16220 = "torch_c.from_builtin_tensor"(%16219) : (tensor<4x1x4096xf32>) -> !torch.vtensor<[4,1,4096],f32>
    %16221 = "torch.aten.div.Tensor"(%16220, %1396) : (!torch.vtensor<[4,1,4096],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],f32>
    %16222 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %16223 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %16224 = "torch.aten.clamp"(%16221, %16222, %16223) : (!torch.vtensor<[4,1,4096],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],f32>
    %16225 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %16226 = "torch.prims.convert_element_type"(%16224, %16225) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %16227 = "torch.aten.div.Tensor"(%16202, %1398) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %16228 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %16229 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %16230 = "torch.aten.clamp"(%16227, %16228, %16229) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %16231 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %16232 = "torch.prims.convert_element_type"(%16230, %16231) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %16233 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %16234 = "torch.aten.unsqueeze"(%1400, %16233) : (!torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>
    %16235 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %16236 = "torch.constant.int"() <{value = 1024 : i64}> : () -> !torch.int
    %16237 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %16238 = "torch.prim.ListConstruct"(%16235, %16236, %16237) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %16239 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %16240 = "torch.aten.expand"(%16234, %16238, %16239) : (!torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ>
    %16241 = "torch_c.to_builtin_tensor"(%16232) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %16242 = "torch_c.to_builtin_tensor"(%16240) : (!torch.vtensor<[4,1024,4096],f8E4M3FNUZ>) -> tensor<4x1024x4096xf8E4M3FNUZ>
    %16243 = "util.call"(%16241, %16242) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x1x1024xf32>
    %16244 = "torch_c.from_builtin_tensor"(%16243) : (tensor<4x1x1024xf32>) -> !torch.vtensor<[4,1,1024],f32>
    %16245 = "torch.aten.div.Tensor"(%16244, %1402) : (!torch.vtensor<[4,1,1024],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,1024],f32>
    %16246 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %16247 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %16248 = "torch.aten.clamp"(%16245, %16246, %16247) : (!torch.vtensor<[4,1,1024],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,1024],f32>
    %16249 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %16250 = "torch.prims.convert_element_type"(%16248, %16249) : (!torch.vtensor<[4,1,1024],f32>, !torch.int) -> !torch.vtensor<[4,1,1024],f8E4M3FNUZ>
    %16251 = "torch.aten.div.Tensor"(%16202, %1404) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %16252 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %16253 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %16254 = "torch.aten.clamp"(%16251, %16252, %16253) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %16255 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %16256 = "torch.prims.convert_element_type"(%16254, %16255) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %16257 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %16258 = "torch.aten.unsqueeze"(%1406, %16257) : (!torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>
    %16259 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %16260 = "torch.constant.int"() <{value = 1024 : i64}> : () -> !torch.int
    %16261 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %16262 = "torch.prim.ListConstruct"(%16259, %16260, %16261) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %16263 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %16264 = "torch.aten.expand"(%16258, %16262, %16263) : (!torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ>
    %16265 = "torch_c.to_builtin_tensor"(%16256) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %16266 = "torch_c.to_builtin_tensor"(%16264) : (!torch.vtensor<[4,1024,4096],f8E4M3FNUZ>) -> tensor<4x1024x4096xf8E4M3FNUZ>
    %16267 = "util.call"(%16265, %16266) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x1x1024xf32>
    %16268 = "torch_c.from_builtin_tensor"(%16267) : (tensor<4x1x1024xf32>) -> !torch.vtensor<[4,1,1024],f32>
    %16269 = "torch.aten.div.Tensor"(%16268, %1408) : (!torch.vtensor<[4,1,1024],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,1024],f32>
    %16270 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %16271 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %16272 = "torch.aten.clamp"(%16269, %16270, %16271) : (!torch.vtensor<[4,1,1024],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,1024],f32>
    %16273 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %16274 = "torch.prims.convert_element_type"(%16272, %16273) : (!torch.vtensor<[4,1,1024],f32>, !torch.int) -> !torch.vtensor<[4,1,1024],f8E4M3FNUZ>
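    // Head split (interpretation): Q reshapes to [4, 1, 32, 128] and K/V to
    // [4, 1, 8, 128]; 32 query heads against 8 KV heads implies grouped-query
    // attention with a group size of 4.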
    %16275 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %16276 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %16277 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %16278 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %16279 = "torch.prim.ListConstruct"(%16275, %16276, %16277, %16278) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %16280 = "torch.aten.view"(%16226, %16279) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,1,32,128],f8E4M3FNUZ>
    %16281 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %16282 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %16283 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %16284 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %16285 = "torch.prim.ListConstruct"(%16281, %16282, %16283, %16284) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %16286 = "torch.aten.view"(%16250, %16285) : (!torch.vtensor<[4,1,1024],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>
    %16287 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %16288 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %16289 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %16290 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %16291 = "torch.prim.ListConstruct"(%16287, %16288, %16289, %16290) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %16292 = "torch.aten.view"(%16274, %16291) : (!torch.vtensor<[4,1,1024],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>
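    // Rotary embedding on Q (interpretation): transpose to [4, 32, 1, 128],
    // then the rotate-half formulation of RoPE:
    //   q_rot = q * cos + rotate_half(q) * sin
    // where rotate_half concatenates (-q[..., 64:], q[..., :64]); the cos and
    // sin tables are the [4, 1, 1, 128] bf16 tensors multiplied in here.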
    %16293 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %16294 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %16295 = "torch.aten.transpose.int"(%16280, %16293, %16294) : (!torch.vtensor<[4,1,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %16296 = "torch.aten.mul.Tensor"(%16295, %1637) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,1,1,128],bf16>) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %16297 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %16298 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %16299 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %16300 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %16301 = "torch.aten.slice.Tensor"(%16295, %16297, %16298, %16299, %16300) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,32,1,64],f8E4M3FNUZ>
    %16302 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %16303 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %16304 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %16305 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %16306 = "torch.aten.slice.Tensor"(%16295, %16302, %16303, %16304, %16305) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,32,1,64],f8E4M3FNUZ>
    %16307 = "torch.aten.neg"(%16306) : (!torch.vtensor<[4,32,1,64],f8E4M3FNUZ>) -> !torch.vtensor<[4,32,1,64],f8E4M3FNUZ>
    %16308 = "torch.prim.ListConstruct"(%16307, %16301) : (!torch.vtensor<[4,32,1,64],f8E4M3FNUZ>, !torch.vtensor<[4,32,1,64],f8E4M3FNUZ>) -> !torch.list<vtensor>
    %16309 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %16310 = "torch.aten.cat"(%16308, %16309) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %16311 = "torch.aten.mul.Tensor"(%16310, %1651) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,1,1,128],bf16>) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %16312 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %16313 = "torch.aten.add.Tensor"(%16296, %16311, %16312) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %16314 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %16315 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %16316 = "torch.aten.transpose.int"(%16313, %16314, %16315) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,1,32,128],f8E4M3FNUZ>
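    // The same RoPE rotation applied to K across its 8 KV heads.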
    %16317 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %16318 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %16319 = "torch.aten.transpose.int"(%16286, %16317, %16318) : (!torch.vtensor<[4,1,8,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %16320 = "torch.aten.mul.Tensor"(%16319, %1637) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,1,1,128],bf16>) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %16321 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %16322 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %16323 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %16324 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %16325 = "torch.aten.slice.Tensor"(%16319, %16321, %16322, %16323, %16324) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,8,1,64],f8E4M3FNUZ>
    %16326 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %16327 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %16328 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %16329 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %16330 = "torch.aten.slice.Tensor"(%16319, %16326, %16327, %16328, %16329) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,8,1,64],f8E4M3FNUZ>
    %16331 = "torch.aten.neg"(%16330) : (!torch.vtensor<[4,8,1,64],f8E4M3FNUZ>) -> !torch.vtensor<[4,8,1,64],f8E4M3FNUZ>
    %16332 = "torch.prim.ListConstruct"(%16331, %16325) : (!torch.vtensor<[4,8,1,64],f8E4M3FNUZ>, !torch.vtensor<[4,8,1,64],f8E4M3FNUZ>) -> !torch.list<vtensor>
    %16333 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %16334 = "torch.aten.cat"(%16332, %16333) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %16335 = "torch.aten.mul.Tensor"(%16334, %1651) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,1,1,128],bf16>) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %16336 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %16337 = "torch.aten.add.Tensor"(%16320, %16335, %16336) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %16338 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %16339 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %16340 = "torch.aten.transpose.int"(%16337, %16338, %16339) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>
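    // Paged KV-cache write for K (interpretation): for each of the 4 sequences,
    // position // 32 selects a logical page, gathered from the per-sequence
    // page table (%arg65), and position % 32 is the token offset within the
    // 32-token page. The constant filled with 30 appears to be this block's
    // layer index, and the cloned scalar a K/V partition selector; together
    // they index the cache viewed as [pages, 32 layers, 2 (K/V), 32 tokens,
    // 8 heads, 128 dim] for the index_put of the new K vector.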
    %16341 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %16342 = "torch.aten.floor_divide.Scalar"(%arg64, %16341) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4],si64>
    %16343 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %16344 = "torch.aten.unsqueeze"(%16342, %16343) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %16345 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %16346 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %16347 = "torch.aten.gather"(%arg65, %16345, %16344, %16346) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.vtensor<[4,1],si64>, !torch.bool) -> !torch.vtensor<[4,1],si64>
    %16348 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %16349 = "torch.aten.remainder.Scalar"(%arg64, %16348) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4],si64>
    %16350 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %16351 = "torch.aten.unsqueeze"(%16349, %16350) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %16352 = "torch.constant.none"() : () -> !torch.none
    %16353 = "torch.aten.clone"(%1409, %16352) : (!torch.vtensor<[],si64>, !torch.none) -> !torch.vtensor<[],si64>
    %16354 = "torch.aten.detach"(%16353) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %16355 = "torch.aten.detach"(%16354) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %16356 = "torch.aten.detach"(%16355) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %16357 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %16358 = "torch.aten.unsqueeze"(%16356, %16357) : (!torch.vtensor<[],si64>, !torch.int) -> !torch.vtensor<[1],si64>
    %16359 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %16360 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %16361 = "torch.prim.ListConstruct"(%16359, %16360) : (!torch.int, !torch.int) -> !torch.list<int>
    %16362 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %16363 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %16364 = "torch.prim.ListConstruct"(%16362, %16363) : (!torch.int, !torch.int) -> !torch.list<int>
    %16365 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %16366 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %16367 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %16368 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %16369 = "torch.aten.empty_strided"(%16361, %16364, %16365, %16366, %16367, %16368) : (!torch.list<int>, !torch.list<int>, !torch.int, !torch.int, !torch.Device, !torch.bool) -> !torch.vtensor<[4,1],si64>
    %16370 = "torch.constant.int"() <{value = 30 : i64}> : () -> !torch.int
    %16371 = "torch.aten.fill.Scalar"(%16369, %16370) : (!torch.vtensor<[4,1],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %16372 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %16373 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %16374 = "torch.prim.ListConstruct"(%16372, %16373) : (!torch.int, !torch.int) -> !torch.list<int>
    %16375 = "torch.aten.repeat"(%16358, %16374) : (!torch.vtensor<[1],si64>, !torch.list<int>) -> !torch.vtensor<[4,1],si64>
    %16376 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %16377 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %16378 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %16379 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %16380 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %16381 = "torch.prim.ListConstruct"(%1483, %16376, %16377, %16378, %16379, %16380) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %16382 = "torch.aten.view"(%15952, %16381) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%16382, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %16383 = "torch.prim.ListConstruct"(%16347, %16371, %16375, %16351) : (!torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>) -> !torch.list<optional<vtensor>>
    %16384 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %16385 = "torch.aten.index_put"(%16382, %16383, %16340, %16384) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>, !torch.bool) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%16385, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %16386 = "torch.constant.int"() <{value = 2097152 : i64}> : () -> !torch.int
    %16387 = "torch.prim.ListConstruct"(%1483, %16386) : (!torch.int, !torch.int) -> !torch.list<int>
    %16388 = "torch.aten.view"(%16385, %16387) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,2097152],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%16388, %1479) <{shape_expressions = #map2}> : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> ()
    %16389 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %16390 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %16391 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %16392 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %16393 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %16394 = "torch.prim.ListConstruct"(%1483, %16389, %16390, %16391, %16392, %16393) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %16395 = "torch.aten.view"(%16388, %16394) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%16395, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %16396 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %16397 = "torch.aten.floor_divide.Scalar"(%arg64, %16396) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4],si64>
    %16398 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %16399 = "torch.aten.unsqueeze"(%16397, %16398) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %16400 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %16401 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %16402 = "torch.aten.gather"(%arg65, %16400, %16399, %16401) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.vtensor<[4,1],si64>, !torch.bool) -> !torch.vtensor<[4,1],si64>
    %16403 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %16404 = "torch.aten.remainder.Scalar"(%arg64, %16403) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4],si64>
    %16405 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %16406 = "torch.aten.unsqueeze"(%16404, %16405) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %16407 = "torch.constant.none"() : () -> !torch.none
    %16408 = "torch.aten.clone"(%1410, %16407) : (!torch.vtensor<[],si64>, !torch.none) -> !torch.vtensor<[],si64>
    %16409 = "torch.aten.detach"(%16408) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %16410 = "torch.aten.detach"(%16409) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %16411 = "torch.aten.detach"(%16410) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %16412 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %16413 = "torch.aten.unsqueeze"(%16411, %16412) : (!torch.vtensor<[],si64>, !torch.int) -> !torch.vtensor<[1],si64>
    %16414 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %16415 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %16416 = "torch.prim.ListConstruct"(%16414, %16415) : (!torch.int, !torch.int) -> !torch.list<int>
    %16417 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %16418 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %16419 = "torch.prim.ListConstruct"(%16417, %16418) : (!torch.int, !torch.int) -> !torch.list<int>
    %16420 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %16421 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %16422 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %16423 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %16424 = "torch.aten.empty_strided"(%16416, %16419, %16420, %16421, %16422, %16423) : (!torch.list<int>, !torch.list<int>, !torch.int, !torch.int, !torch.Device, !torch.bool) -> !torch.vtensor<[4,1],si64>
    %16425 = "torch.constant.int"() <{value = 30 : i64}> : () -> !torch.int
    %16426 = "torch.aten.fill.Scalar"(%16424, %16425) : (!torch.vtensor<[4,1],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %16427 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %16428 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %16429 = "torch.prim.ListConstruct"(%16427, %16428) : (!torch.int, !torch.int) -> !torch.list<int>
    %16430 = "torch.aten.repeat"(%16413, %16429) : (!torch.vtensor<[1],si64>, !torch.list<int>) -> !torch.vtensor<[4,1],si64>
    %16431 = "torch.prim.ListConstruct"(%16402, %16426, %16430, %16406) : (!torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>) -> !torch.list<optional<vtensor>>
    %16432 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %16433 = "torch.aten.index_put"(%16395, %16431, %16292, %16432) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>, !torch.bool) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%16433, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %16434 = "torch.constant.int"() <{value = 2097152 : i64}> : () -> !torch.int
    %16435 = "torch.prim.ListConstruct"(%1483, %16434) : (!torch.int, !torch.int) -> !torch.list<int>
    %16436 = "torch.aten.view"(%16433, %16435) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,2097152],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%16436, %1479) <{shape_expressions = #map2}> : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> ()
    %16437 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %16438 = "torch.aten.mul.Scalar"(%arg65, %16437) : (!torch.vtensor<[4,?],si64>, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%16438, %1478) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %16439 = "torch.constant.int"() <{value = 60 : i64}> : () -> !torch.int
    %16440 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %16441 = "torch.aten.add.Scalar"(%16438, %16439, %16440) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%16441, %1478) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %16442 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %16443 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %16444 = "torch.aten.add.Scalar"(%16441, %16442, %16443) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%16444, %1478) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %16445 = "torch.prim.ListConstruct"(%1924) : (!torch.int) -> !torch.list<int>
    %16446 = "torch.aten.view"(%16444, %16445) : (!torch.vtensor<[4,?],si64>, !torch.list<int>) -> !torch.vtensor<[?],si64>
    "torch.bind_symbolic_shape"(%16446, %1478) <{shape_expressions = #map24}> : (!torch.vtensor<[?],si64>, !torch.int) -> ()
    %16447 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %16448 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %16449 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %16450 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %16451 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %16452 = "torch.prim.ListConstruct"(%1483, %16447, %16448, %16449, %16450, %16451) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %16453 = "torch.aten.view"(%16436, %16452) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%16453, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %16454 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %16455 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %16456 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %16457 = "torch.prim.ListConstruct"(%1914, %16454, %16455, %16456) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %16458 = "torch.aten.view"(%16453, %16457) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%16458, %1479) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %16459 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %16460 = "torch.aten.index_select"(%16458, %16459, %16446) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int, !torch.vtensor<[?],si64>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%16460, %1478) <{shape_expressions = #map23}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %16461 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %16462 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %16463 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %16464 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %16465 = "torch.prim.ListConstruct"(%16461, %1481, %16462, %16463, %16464) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %16466 = "torch.aten.view"(%16460, %16465) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%16466, %1478) <{shape_expressions = #map22}> : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %16467 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %16468 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %16469 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %16470 = "torch.prim.ListConstruct"(%16467, %1485, %16468, %16469) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %16471 = "torch.aten.view"(%16466, %16470) : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%16471, %1478) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %16472 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %16473 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %16474 = "torch.aten.add.Scalar"(%16441, %16472, %16473) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%16474, %1478) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %16475 = "torch.prim.ListConstruct"(%1924) : (!torch.int) -> !torch.list<int>
    %16476 = "torch.aten.view"(%16474, %16475) : (!torch.vtensor<[4,?],si64>, !torch.list<int>) -> !torch.vtensor<[?],si64>
    "torch.bind_symbolic_shape"(%16476, %1478) <{shape_expressions = #map24}> : (!torch.vtensor<[?],si64>, !torch.int) -> ()
    %16477 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %16478 = "torch.aten.index_select"(%16458, %16477, %16476) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int, !torch.vtensor<[?],si64>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%16478, %1478) <{shape_expressions = #map23}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %16479 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %16480 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %16481 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %16482 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %16483 = "torch.prim.ListConstruct"(%16479, %1481, %16480, %16481, %16482) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %16484 = "torch.aten.view"(%16478, %16483) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%16484, %1478) <{shape_expressions = #map22}> : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %16485 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %16486 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %16487 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %16488 = "torch.prim.ListConstruct"(%16485, %1485, %16486, %16487) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %16489 = "torch.aten.view"(%16484, %16488) : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%16489, %1478) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %16490 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %16491 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %16492 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %16493 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %16494 = "torch.aten.slice.Tensor"(%16471, %16490, %16491, %16492, %16493) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%16494, %1478) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %16495 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %16496 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %16497 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %16498 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %16499 = "torch.aten.slice.Tensor"(%16489, %16495, %16496, %16497, %16498) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%16499, %1478) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %16500 = "torch.constant.int"() <{value = -2 : i64}> : () -> !torch.int
    %16501 = "torch.aten.unsqueeze"(%16494, %16500) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%16501, %1478) <{shape_expressions = #map25}> : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.int) -> ()
    %16502 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %16503 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %16504 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %16505 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %16506 = "torch.prim.ListConstruct"(%16502, %1485, %16503, %16504, %16505) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %16507 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %16508 = "torch.aten.expand"(%16501, %16506, %16507) : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%16508, %1478) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %16509 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %16510 = "torch.aten.clone"(%16508, %16509) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%16510, %1478) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %16511 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %16512 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %16513 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %16514 = "torch.prim.ListConstruct"(%16511, %1485, %16512, %16513) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %16515 = "torch.aten._unsafe_view"(%16510, %16514) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%16515, %1478) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %16516 = "torch.constant.int"() <{value = -2 : i64}> : () -> !torch.int
    %16517 = "torch.aten.unsqueeze"(%16499, %16516) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%16517, %1478) <{shape_expressions = #map25}> : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.int) -> ()
    %16518 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %16519 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %16520 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %16521 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %16522 = "torch.prim.ListConstruct"(%16518, %1485, %16519, %16520, %16521) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %16523 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %16524 = "torch.aten.expand"(%16517, %16522, %16523) : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%16524, %1478) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %16525 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %16526 = "torch.aten.clone"(%16524, %16525) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%16526, %1478) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %16527 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %16528 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %16529 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %16530 = "torch.prim.ListConstruct"(%16527, %1485, %16528, %16529) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %16531 = "torch.aten._unsafe_view"(%16526, %16530) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%16531, %1478) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %16532 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %16533 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %16534 = "torch.aten.transpose.int"(%16316, %16532, %16533) : (!torch.vtensor<[4,1,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %16535 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %16536 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %16537 = "torch.aten.transpose.int"(%16515, %16535, %16536) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%16537, %1478) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %16538 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %16539 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %16540 = "torch.aten.transpose.int"(%16531, %16538, %16539) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%16540, %1478) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %16541 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %16542 = "torch.aten.squeeze.dim"(%1516, %16541) : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,1,1,?],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%16542, %1478) <{shape_expressions = #map7}> : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>, !torch.int) -> ()
    %16543 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %16544 = "torch.aten.squeeze.dim"(%16542, %16543) : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,1,1,?],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%16544, %1478) <{shape_expressions = #map7}> : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>, !torch.int) -> ()
    %16545 = "torch_c.to_builtin_tensor"(%16534) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>) -> tensor<4x32x1x128xf8E4M3FNUZ>
    %16546 = "tensor.cast"(%16545) : (tensor<4x32x1x128xf8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %16547 = "torch_c.to_builtin_tensor"(%16537) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %16548 = "torch_c.to_builtin_tensor"(%16540) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %16549 = "torch_c.to_builtin_tensor"(%16544) : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>) -> tensor<4x1x1x?xf8E4M3FNUZ>
    %16550 = "tensor.cast"(%16549) : (tensor<4x1x1x?xf8E4M3FNUZ>) -> tensor<?x?x?x?xf8E4M3FNUZ>
    %16551 = "torch_c.to_builtin_tensor"(%1412) : (!torch.vtensor<[],f32>) -> tensor<f32>
    %16552 = "util.call"(%16546, %16547, %16548, %16551, %16550) <{callee = @sharktank_masked_flash_attention_4_32_128_128_f8E4M3FNUZ_f32_f32}> : (tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<f32>, tensor<?x?x?x?xf8E4M3FNUZ>) -> tensor<4x32x?x128xf32>
    %16553 = "tensor.cast"(%16552) : (tensor<4x32x?x128xf32>) -> tensor<4x32x1x128xf32>
    %16554 = "torch_c.from_builtin_tensor"(%16553) : (tensor<4x32x1x128xf32>) -> !torch.vtensor<[4,32,1,128],f32>
    %16555 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %16556 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %16557 = "torch.aten.transpose.int"(%16554, %16555, %16556) : (!torch.vtensor<[4,32,1,128],f32>, !torch.int, !torch.int) -> !torch.vtensor<[4,1,32,128],f32>
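    // Attention output path (interpretation): the transposed [4, 1, 32, 128]
    // result is flattened to [4, 1, 4096], quantized to f8, and run through
    // the 4096x4096 output projection; the bf16 result is added back to the
    // residual stream.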
    %16558 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %16559 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %16560 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %16561 = "torch.prim.ListConstruct"(%16558, %16559, %16560) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %16562 = "torch.aten.view"(%16557, %16561) : (!torch.vtensor<[4,1,32,128],f32>, !torch.list<int>) -> !torch.vtensor<[4,1,4096],f32>
    %16563 = "torch.aten.div.Tensor"(%16562, %1414) : (!torch.vtensor<[4,1,4096],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],f32>
    %16564 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %16565 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %16566 = "torch.aten.clamp"(%16563, %16564, %16565) : (!torch.vtensor<[4,1,4096],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],f32>
    %16567 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %16568 = "torch.prims.convert_element_type"(%16566, %16567) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %16569 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %16570 = "torch.aten.unsqueeze"(%1416, %16569) : (!torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>
    %16571 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %16572 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %16573 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %16574 = "torch.prim.ListConstruct"(%16571, %16572, %16573) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %16575 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %16576 = "torch.aten.expand"(%16570, %16574, %16575) : (!torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ>
    %16577 = "torch_c.to_builtin_tensor"(%16568) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %16578 = "torch_c.to_builtin_tensor"(%16576) : (!torch.vtensor<[4,4096,4096],f8E4M3FNUZ>) -> tensor<4x4096x4096xf8E4M3FNUZ>
    %16579 = "util.call"(%16577, %16578) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x1x4096xf32>
    %16580 = "torch_c.from_builtin_tensor"(%16579) : (tensor<4x1x4096xf32>) -> !torch.vtensor<[4,1,4096],f32>
    %16581 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %16582 = "torch.prims.convert_element_type"(%16580, %16581) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %16583 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %16584 = "torch.aten.add.Tensor"(%16183, %16582, %16583) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
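    // FFN sub-layer of this block (interpretation): RMSNorm over the updated
    // residual stream, fp8 quantization, and the gate projection
    // (4096 -> 14336), repeating the SwiGLU pattern annotated above.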
    %16585 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %16586 = "torch.prims.convert_element_type"(%16584, %16585) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f32>
    %16587 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %16588 = "torch.aten.pow.Tensor_Scalar"(%16586, %16587) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],f32>
    %16589 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %16590 = "torch.prim.ListConstruct"(%16589) : (!torch.int) -> !torch.list<int>
    %16591 = "torch.constant.bool"() <{value = true}> : () -> !torch.bool
    %16592 = "torch.constant.none"() : () -> !torch.none
    %16593 = "torch.aten.mean.dim"(%16588, %16590, %16591, %16592) : (!torch.vtensor<[4,1,4096],f32>, !torch.list<int>, !torch.bool, !torch.none) -> !torch.vtensor<[4,1,1],f32>
    %16594 = "torch.constant.float"() <{value = 1.000000e-05 : f64}> : () -> !torch.float
    %16595 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %16596 = "torch.aten.add.Scalar"(%16593, %16594, %16595) : (!torch.vtensor<[4,1,1],f32>, !torch.float, !torch.int) -> !torch.vtensor<[4,1,1],f32>
    %16597 = "torch.aten.rsqrt"(%16596) : (!torch.vtensor<[4,1,1],f32>) -> !torch.vtensor<[4,1,1],f32>
    %16598 = "torch.aten.mul.Tensor"(%16586, %16597) : (!torch.vtensor<[4,1,4096],f32>, !torch.vtensor<[4,1,1],f32>) -> !torch.vtensor<[4,1,4096],f32>
    %16599 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %16600 = "torch.prims.convert_element_type"(%16598, %16599) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %16601 = "torch.aten.mul.Tensor"(%1418, %16600) : (!torch.vtensor<[4096],bf16>, !torch.vtensor<[4,1,4096],bf16>) -> !torch.vtensor<[4,1,4096],bf16>
    %16602 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %16603 = "torch.prims.convert_element_type"(%16601, %16602) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %16604 = "torch.aten.div.Tensor"(%16603, %1420) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %16605 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %16606 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %16607 = "torch.aten.clamp"(%16604, %16605, %16606) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %16608 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %16609 = "torch.prims.convert_element_type"(%16607, %16608) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %16610 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %16611 = "torch.aten.unsqueeze"(%1422, %16610) : (!torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>
    %16612 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %16613 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %16614 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %16615 = "torch.prim.ListConstruct"(%16612, %16613, %16614) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %16616 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %16617 = "torch.aten.expand"(%16611, %16615, %16616) : (!torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ>
    %16618 = "torch_c.to_builtin_tensor"(%16609) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %16619 = "torch_c.to_builtin_tensor"(%16617) : (!torch.vtensor<[4,14336,4096],f8E4M3FNUZ>) -> tensor<4x14336x4096xf8E4M3FNUZ>
    %16620 = "util.call"(%16618, %16619) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x1x14336xf32>
    %16621 = "torch_c.from_builtin_tensor"(%16620) : (tensor<4x1x14336xf32>) -> !torch.vtensor<[4,1,14336],f32>
    %16622 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %16623 = "torch.prims.convert_element_type"(%16621, %16622) : (!torch.vtensor<[4,1,14336],f32>, !torch.int) -> !torch.vtensor<[4,1,14336],bf16>
    %16624 = "torch.aten.silu"(%16623) : (!torch.vtensor<[4,1,14336],bf16>) -> !torch.vtensor<[4,1,14336],bf16>
    %16625 = "torch.aten.div.Tensor"(%16603, %1424) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %16626 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %16627 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %16628 = "torch.aten.clamp"(%16625, %16626, %16627) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %16629 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %16630 = "torch.prims.convert_element_type"(%16628, %16629) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %16631 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %16632 = "torch.aten.unsqueeze"(%1426, %16631) : (!torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>
    %16633 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %16634 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %16635 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %16636 = "torch.prim.ListConstruct"(%16633, %16634, %16635) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %16637 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %16638 = "torch.aten.expand"(%16632, %16636, %16637) : (!torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ>
    %16639 = "torch_c.to_builtin_tensor"(%16630) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %16640 = "torch_c.to_builtin_tensor"(%16638) : (!torch.vtensor<[4,14336,4096],f8E4M3FNUZ>) -> tensor<4x14336x4096xf8E4M3FNUZ>
    %16641 = "util.call"(%16639, %16640) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x1x14336xf32>
    %16642 = "torch_c.from_builtin_tensor"(%16641) : (tensor<4x1x14336xf32>) -> !torch.vtensor<[4,1,14336],f32>
    %16643 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %16644 = "torch.prims.convert_element_type"(%16642, %16643) : (!torch.vtensor<[4,1,14336],f32>, !torch.int) -> !torch.vtensor<[4,1,14336],bf16>
    %16645 = "torch.aten.mul.Tensor"(%16624, %16644) : (!torch.vtensor<[4,1,14336],bf16>, !torch.vtensor<[4,1,14336],bf16>) -> !torch.vtensor<[4,1,14336],bf16>
    %16646 = "torch.aten.div.Tensor"(%16645, %1428) : (!torch.vtensor<[4,1,14336],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,14336],bf16>
    %16647 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %16648 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %16649 = "torch.aten.clamp"(%16646, %16647, %16648) : (!torch.vtensor<[4,1,14336],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,14336],bf16>
    %16650 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %16651 = "torch.prims.convert_element_type"(%16649, %16650) : (!torch.vtensor<[4,1,14336],bf16>, !torch.int) -> !torch.vtensor<[4,1,14336],f8E4M3FNUZ>
    %16652 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %16653 = "torch.aten.unsqueeze"(%1430, %16652) : (!torch.vtensor<[4096,14336],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,14336],f8E4M3FNUZ>
    %16654 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %16655 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %16656 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %16657 = "torch.prim.ListConstruct"(%16654, %16655, %16656) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %16658 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %16659 = "torch.aten.expand"(%16653, %16657, %16658) : (!torch.vtensor<[1,4096,14336],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,14336],f8E4M3FNUZ>
    %16660 = "torch_c.to_builtin_tensor"(%16651) : (!torch.vtensor<[4,1,14336],f8E4M3FNUZ>) -> tensor<4x1x14336xf8E4M3FNUZ>
    %16661 = "torch_c.to_builtin_tensor"(%16659) : (!torch.vtensor<[4,4096,14336],f8E4M3FNUZ>) -> tensor<4x4096x14336xf8E4M3FNUZ>
    %16662 = "util.call"(%16660, %16661) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x14336xf8E4M3FNUZ_R4x4096x14336xf8E4M3FNUZ}> : (tensor<4x1x14336xf8E4M3FNUZ>, tensor<4x4096x14336xf8E4M3FNUZ>) -> tensor<4x1x4096xf32>
    %16663 = "torch_c.from_builtin_tensor"(%16662) : (tensor<4x1x4096xf32>) -> !torch.vtensor<[4,1,4096],f32>
    %16664 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %16665 = "torch.prims.convert_element_type"(%16663, %16664) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %16666 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %16667 = "torch.aten.add.Tensor"(%16584, %16665, %16666) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %16668 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %16669 = "torch.prims.convert_element_type"(%16667, %16668) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f32>
    %16670 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %16671 = "torch.aten.pow.Tensor_Scalar"(%16669, %16670) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],f32>
    %16672 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %16673 = "torch.prim.ListConstruct"(%16672) : (!torch.int) -> !torch.list<int>
    %16674 = "torch.constant.bool"() <{value = true}> : () -> !torch.bool
    %16675 = "torch.constant.none"() : () -> !torch.none
    %16676 = "torch.aten.mean.dim"(%16671, %16673, %16674, %16675) : (!torch.vtensor<[4,1,4096],f32>, !torch.list<int>, !torch.bool, !torch.none) -> !torch.vtensor<[4,1,1],f32>
    %16677 = "torch.constant.float"() <{value = 1.000000e-05 : f64}> : () -> !torch.float
    %16678 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %16679 = "torch.aten.add.Scalar"(%16676, %16677, %16678) : (!torch.vtensor<[4,1,1],f32>, !torch.float, !torch.int) -> !torch.vtensor<[4,1,1],f32>
    %16680 = "torch.aten.rsqrt"(%16679) : (!torch.vtensor<[4,1,1],f32>) -> !torch.vtensor<[4,1,1],f32>
    %16681 = "torch.aten.mul.Tensor"(%16669, %16680) : (!torch.vtensor<[4,1,4096],f32>, !torch.vtensor<[4,1,1],f32>) -> !torch.vtensor<[4,1,4096],f32>
    %16682 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %16683 = "torch.prims.convert_element_type"(%16681, %16682) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %16684 = "torch.aten.mul.Tensor"(%1432, %16683) : (!torch.vtensor<[4096],bf16>, !torch.vtensor<[4,1,4096],bf16>) -> !torch.vtensor<[4,1,4096],bf16>
    %16685 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %16686 = "torch.prims.convert_element_type"(%16684, %16685) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %16687 = "torch.aten.div.Tensor"(%16686, %1434) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %16688 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %16689 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %16690 = "torch.aten.clamp"(%16687, %16688, %16689) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %16691 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %16692 = "torch.prims.convert_element_type"(%16690, %16691) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %16693 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %16694 = "torch.aten.unsqueeze"(%1436, %16693) : (!torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>
    %16695 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %16696 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %16697 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %16698 = "torch.prim.ListConstruct"(%16695, %16696, %16697) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %16699 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %16700 = "torch.aten.expand"(%16694, %16698, %16699) : (!torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ>
    %16701 = "torch_c.to_builtin_tensor"(%16692) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %16702 = "torch_c.to_builtin_tensor"(%16700) : (!torch.vtensor<[4,4096,4096],f8E4M3FNUZ>) -> tensor<4x4096x4096xf8E4M3FNUZ>
    %16703 = "util.call"(%16701, %16702) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x1x4096xf32>
    %16704 = "torch_c.from_builtin_tensor"(%16703) : (tensor<4x1x4096xf32>) -> !torch.vtensor<[4,1,4096],f32>
    %16705 = "torch.aten.div.Tensor"(%16704, %1438) : (!torch.vtensor<[4,1,4096],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],f32>
    %16706 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %16707 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %16708 = "torch.aten.clamp"(%16705, %16706, %16707) : (!torch.vtensor<[4,1,4096],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],f32>
    %16709 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %16710 = "torch.prims.convert_element_type"(%16708, %16709) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %16711 = "torch.aten.div.Tensor"(%16686, %1440) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %16712 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %16713 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %16714 = "torch.aten.clamp"(%16711, %16712, %16713) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %16715 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %16716 = "torch.prims.convert_element_type"(%16714, %16715) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %16717 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %16718 = "torch.aten.unsqueeze"(%1442, %16717) : (!torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>
    %16719 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %16720 = "torch.constant.int"() <{value = 1024 : i64}> : () -> !torch.int
    %16721 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %16722 = "torch.prim.ListConstruct"(%16719, %16720, %16721) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %16723 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %16724 = "torch.aten.expand"(%16718, %16722, %16723) : (!torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ>
    %16725 = "torch_c.to_builtin_tensor"(%16716) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %16726 = "torch_c.to_builtin_tensor"(%16724) : (!torch.vtensor<[4,1024,4096],f8E4M3FNUZ>) -> tensor<4x1024x4096xf8E4M3FNUZ>
    %16727 = "util.call"(%16725, %16726) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x1x1024xf32>
    %16728 = "torch_c.from_builtin_tensor"(%16727) : (tensor<4x1x1024xf32>) -> !torch.vtensor<[4,1,1024],f32>
    %16729 = "torch.aten.div.Tensor"(%16728, %1444) : (!torch.vtensor<[4,1,1024],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,1024],f32>
    %16730 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %16731 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %16732 = "torch.aten.clamp"(%16729, %16730, %16731) : (!torch.vtensor<[4,1,1024],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,1024],f32>
    %16733 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %16734 = "torch.prims.convert_element_type"(%16732, %16733) : (!torch.vtensor<[4,1,1024],f32>, !torch.int) -> !torch.vtensor<[4,1,1024],f8E4M3FNUZ>
    %16735 = "torch.aten.div.Tensor"(%16686, %1446) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %16736 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %16737 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %16738 = "torch.aten.clamp"(%16735, %16736, %16737) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %16739 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %16740 = "torch.prims.convert_element_type"(%16738, %16739) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %16741 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %16742 = "torch.aten.unsqueeze"(%1448, %16741) : (!torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>
    %16743 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %16744 = "torch.constant.int"() <{value = 1024 : i64}> : () -> !torch.int
    %16745 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %16746 = "torch.prim.ListConstruct"(%16743, %16744, %16745) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %16747 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %16748 = "torch.aten.expand"(%16742, %16746, %16747) : (!torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ>
    %16749 = "torch_c.to_builtin_tensor"(%16740) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %16750 = "torch_c.to_builtin_tensor"(%16748) : (!torch.vtensor<[4,1024,4096],f8E4M3FNUZ>) -> tensor<4x1024x4096xf8E4M3FNUZ>
    %16751 = "util.call"(%16749, %16750) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x1x1024xf32>
    %16752 = "torch_c.from_builtin_tensor"(%16751) : (tensor<4x1x1024xf32>) -> !torch.vtensor<[4,1,1024],f32>
    %16753 = "torch.aten.div.Tensor"(%16752, %1450) : (!torch.vtensor<[4,1,1024],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,1024],f32>
    %16754 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %16755 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %16756 = "torch.aten.clamp"(%16753, %16754, %16755) : (!torch.vtensor<[4,1,1024],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,1024],f32>
    %16757 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %16758 = "torch.prims.convert_element_type"(%16756, %16757) : (!torch.vtensor<[4,1,1024],f32>, !torch.int) -> !torch.vtensor<[4,1,1024],f8E4M3FNUZ>
    %16759 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %16760 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %16761 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %16762 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %16763 = "torch.prim.ListConstruct"(%16759, %16760, %16761, %16762) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %16764 = "torch.aten.view"(%16710, %16763) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,1,32,128],f8E4M3FNUZ>
    %16765 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %16766 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %16767 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %16768 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %16769 = "torch.prim.ListConstruct"(%16765, %16766, %16767, %16768) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %16770 = "torch.aten.view"(%16734, %16769) : (!torch.vtensor<[4,1,1024],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>
    %16771 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %16772 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %16773 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %16774 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %16775 = "torch.prim.ListConstruct"(%16771, %16772, %16773, %16774) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %16776 = "torch.aten.view"(%16758, %16775) : (!torch.vtensor<[4,1,1024],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>
    %16777 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %16778 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %16779 = "torch.aten.transpose.int"(%16764, %16777, %16778) : (!torch.vtensor<[4,1,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %16780 = "torch.aten.mul.Tensor"(%16779, %1637) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,1,1,128],bf16>) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %16781 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %16782 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %16783 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %16784 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %16785 = "torch.aten.slice.Tensor"(%16779, %16781, %16782, %16783, %16784) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,32,1,64],f8E4M3FNUZ>
    %16786 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %16787 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %16788 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %16789 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %16790 = "torch.aten.slice.Tensor"(%16779, %16786, %16787, %16788, %16789) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,32,1,64],f8E4M3FNUZ>
    %16791 = "torch.aten.neg"(%16790) : (!torch.vtensor<[4,32,1,64],f8E4M3FNUZ>) -> !torch.vtensor<[4,32,1,64],f8E4M3FNUZ>
    %16792 = "torch.prim.ListConstruct"(%16791, %16785) : (!torch.vtensor<[4,32,1,64],f8E4M3FNUZ>, !torch.vtensor<[4,32,1,64],f8E4M3FNUZ>) -> !torch.list<vtensor>
    %16793 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %16794 = "torch.aten.cat"(%16792, %16793) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %16795 = "torch.aten.mul.Tensor"(%16794, %1651) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,1,1,128],bf16>) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %16796 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %16797 = "torch.aten.add.Tensor"(%16780, %16795, %16796) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %16798 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %16799 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %16800 = "torch.aten.transpose.int"(%16797, %16798, %16799) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,1,32,128],f8E4M3FNUZ>
    %16801 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %16802 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %16803 = "torch.aten.transpose.int"(%16770, %16801, %16802) : (!torch.vtensor<[4,1,8,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %16804 = "torch.aten.mul.Tensor"(%16803, %1637) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,1,1,128],bf16>) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %16805 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %16806 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %16807 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %16808 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %16809 = "torch.aten.slice.Tensor"(%16803, %16805, %16806, %16807, %16808) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,8,1,64],f8E4M3FNUZ>
    %16810 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
    %16811 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %16812 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %16813 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %16814 = "torch.aten.slice.Tensor"(%16803, %16810, %16811, %16812, %16813) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,8,1,64],f8E4M3FNUZ>
    %16815 = "torch.aten.neg"(%16814) : (!torch.vtensor<[4,8,1,64],f8E4M3FNUZ>) -> !torch.vtensor<[4,8,1,64],f8E4M3FNUZ>
    %16816 = "torch.prim.ListConstruct"(%16815, %16809) : (!torch.vtensor<[4,8,1,64],f8E4M3FNUZ>, !torch.vtensor<[4,8,1,64],f8E4M3FNUZ>) -> !torch.list<vtensor>
    %16817 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %16818 = "torch.aten.cat"(%16816, %16817) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %16819 = "torch.aten.mul.Tensor"(%16818, %1651) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,1,1,128],bf16>) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %16820 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %16821 = "torch.aten.add.Tensor"(%16804, %16819, %16820) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,8,1,128],f8E4M3FNUZ>
    %16822 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %16823 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %16824 = "torch.aten.transpose.int"(%16821, %16822, %16823) : (!torch.vtensor<[4,8,1,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>
    %16825 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %16826 = "torch.aten.floor_divide.Scalar"(%arg64, %16825) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4],si64>
    %16827 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %16828 = "torch.aten.unsqueeze"(%16826, %16827) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %16829 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %16830 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %16831 = "torch.aten.gather"(%arg65, %16829, %16828, %16830) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.vtensor<[4,1],si64>, !torch.bool) -> !torch.vtensor<[4,1],si64>
    %16832 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %16833 = "torch.aten.remainder.Scalar"(%arg64, %16832) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4],si64>
    %16834 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %16835 = "torch.aten.unsqueeze"(%16833, %16834) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %16836 = "torch.constant.none"() : () -> !torch.none
    %16837 = "torch.aten.clone"(%1451, %16836) : (!torch.vtensor<[],si64>, !torch.none) -> !torch.vtensor<[],si64>
    %16838 = "torch.aten.detach"(%16837) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %16839 = "torch.aten.detach"(%16838) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %16840 = "torch.aten.detach"(%16839) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %16841 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %16842 = "torch.aten.unsqueeze"(%16840, %16841) : (!torch.vtensor<[],si64>, !torch.int) -> !torch.vtensor<[1],si64>
    %16843 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %16844 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %16845 = "torch.prim.ListConstruct"(%16843, %16844) : (!torch.int, !torch.int) -> !torch.list<int>
    %16846 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %16847 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %16848 = "torch.prim.ListConstruct"(%16846, %16847) : (!torch.int, !torch.int) -> !torch.list<int>
    %16849 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %16850 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %16851 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %16852 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %16853 = "torch.aten.empty_strided"(%16845, %16848, %16849, %16850, %16851, %16852) : (!torch.list<int>, !torch.list<int>, !torch.int, !torch.int, !torch.Device, !torch.bool) -> !torch.vtensor<[4,1],si64>
    %16854 = "torch.constant.int"() <{value = 31 : i64}> : () -> !torch.int
    %16855 = "torch.aten.fill.Scalar"(%16853, %16854) : (!torch.vtensor<[4,1],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %16856 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %16857 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %16858 = "torch.prim.ListConstruct"(%16856, %16857) : (!torch.int, !torch.int) -> !torch.list<int>
    %16859 = "torch.aten.repeat"(%16842, %16858) : (!torch.vtensor<[1],si64>, !torch.list<int>) -> !torch.vtensor<[4,1],si64>
    %16860 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %16861 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %16862 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %16863 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %16864 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %16865 = "torch.prim.ListConstruct"(%1483, %16860, %16861, %16862, %16863, %16864) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %16866 = "torch.aten.view"(%16436, %16865) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%16866, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %16867 = "torch.prim.ListConstruct"(%16831, %16855, %16859, %16835) : (!torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>) -> !torch.list<optional<vtensor>>
    %16868 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %16869 = "torch.aten.index_put"(%16866, %16867, %16824, %16868) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>, !torch.bool) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%16869, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %16870 = "torch.constant.int"() <{value = 2097152 : i64}> : () -> !torch.int
    %16871 = "torch.prim.ListConstruct"(%1483, %16870) : (!torch.int, !torch.int) -> !torch.list<int>
    %16872 = "torch.aten.view"(%16869, %16871) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,2097152],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%16872, %1479) <{shape_expressions = #map2}> : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> ()
    %16873 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %16874 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %16875 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %16876 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %16877 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %16878 = "torch.prim.ListConstruct"(%1483, %16873, %16874, %16875, %16876, %16877) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %16879 = "torch.aten.view"(%16872, %16878) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%16879, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %16880 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %16881 = "torch.aten.floor_divide.Scalar"(%arg64, %16880) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4],si64>
    %16882 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %16883 = "torch.aten.unsqueeze"(%16881, %16882) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %16884 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %16885 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %16886 = "torch.aten.gather"(%arg65, %16884, %16883, %16885) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.vtensor<[4,1],si64>, !torch.bool) -> !torch.vtensor<[4,1],si64>
    %16887 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %16888 = "torch.aten.remainder.Scalar"(%arg64, %16887) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4],si64>
    %16889 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %16890 = "torch.aten.unsqueeze"(%16888, %16889) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %16891 = "torch.constant.none"() : () -> !torch.none
    %16892 = "torch.aten.clone"(%1452, %16891) : (!torch.vtensor<[],si64>, !torch.none) -> !torch.vtensor<[],si64>
    %16893 = "torch.aten.detach"(%16892) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %16894 = "torch.aten.detach"(%16893) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %16895 = "torch.aten.detach"(%16894) : (!torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>
    %16896 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %16897 = "torch.aten.unsqueeze"(%16895, %16896) : (!torch.vtensor<[],si64>, !torch.int) -> !torch.vtensor<[1],si64>
    %16898 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %16899 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %16900 = "torch.prim.ListConstruct"(%16898, %16899) : (!torch.int, !torch.int) -> !torch.list<int>
    %16901 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %16902 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %16903 = "torch.prim.ListConstruct"(%16901, %16902) : (!torch.int, !torch.int) -> !torch.list<int>
    %16904 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %16905 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %16906 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
    %16907 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %16908 = "torch.aten.empty_strided"(%16900, %16903, %16904, %16905, %16906, %16907) : (!torch.list<int>, !torch.list<int>, !torch.int, !torch.int, !torch.Device, !torch.bool) -> !torch.vtensor<[4,1],si64>
    %16909 = "torch.constant.int"() <{value = 31 : i64}> : () -> !torch.int
    %16910 = "torch.aten.fill.Scalar"(%16908, %16909) : (!torch.vtensor<[4,1],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
    %16911 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %16912 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %16913 = "torch.prim.ListConstruct"(%16911, %16912) : (!torch.int, !torch.int) -> !torch.list<int>
    %16914 = "torch.aten.repeat"(%16897, %16913) : (!torch.vtensor<[1],si64>, !torch.list<int>) -> !torch.vtensor<[4,1],si64>
    %16915 = "torch.prim.ListConstruct"(%16886, %16910, %16914, %16890) : (!torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>, !torch.vtensor<[4,1],si64>) -> !torch.list<optional<vtensor>>
    %16916 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %16917 = "torch.aten.index_put"(%16879, %16915, %16776, %16916) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[4,1,8,128],f8E4M3FNUZ>, !torch.bool) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%16917, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %16918 = "torch.constant.int"() <{value = 2097152 : i64}> : () -> !torch.int
    %16919 = "torch.prim.ListConstruct"(%1483, %16918) : (!torch.int, !torch.int) -> !torch.list<int>
    %16920 = "torch.aten.view"(%16917, %16919) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,2097152],f8E4M3FNUZ>
    "torch.overwrite.tensor.contents"(%16920, %arg66) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.tensor<[?,2097152],f8E4M3FNUZ>) -> ()
    "torch.bind_symbolic_shape"(%16920, %1479) <{shape_expressions = #map2}> : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> ()
    %16921 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
    %16922 = "torch.aten.mul.Scalar"(%arg65, %16921) : (!torch.vtensor<[4,?],si64>, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%16922, %1478) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %16923 = "torch.constant.int"() <{value = 62 : i64}> : () -> !torch.int
    %16924 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %16925 = "torch.aten.add.Scalar"(%16922, %16923, %16924) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%16925, %1478) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %16926 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %16927 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %16928 = "torch.aten.add.Scalar"(%16925, %16926, %16927) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%16928, %1478) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %16929 = "torch.prim.ListConstruct"(%1924) : (!torch.int) -> !torch.list<int>
    %16930 = "torch.aten.view"(%16928, %16929) : (!torch.vtensor<[4,?],si64>, !torch.list<int>) -> !torch.vtensor<[?],si64>
    "torch.bind_symbolic_shape"(%16930, %1478) <{shape_expressions = #map24}> : (!torch.vtensor<[?],si64>, !torch.int) -> ()
    %16931 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %16932 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %16933 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %16934 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %16935 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %16936 = "torch.prim.ListConstruct"(%1483, %16931, %16932, %16933, %16934, %16935) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %16937 = "torch.aten.view"(%16920, %16936) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%16937, %1479) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %16938 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %16939 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %16940 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %16941 = "torch.prim.ListConstruct"(%1914, %16938, %16939, %16940) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %16942 = "torch.aten.view"(%16937, %16941) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%16942, %1479) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %16943 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %16944 = "torch.aten.index_select"(%16942, %16943, %16930) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int, !torch.vtensor<[?],si64>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%16944, %1478) <{shape_expressions = #map23}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %16945 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %16946 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %16947 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %16948 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %16949 = "torch.prim.ListConstruct"(%16945, %1481, %16946, %16947, %16948) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %16950 = "torch.aten.view"(%16944, %16949) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%16950, %1478) <{shape_expressions = #map22}> : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %16951 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %16952 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %16953 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %16954 = "torch.prim.ListConstruct"(%16951, %1485, %16952, %16953) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %16955 = "torch.aten.view"(%16950, %16954) : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%16955, %1478) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %16956 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %16957 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %16958 = "torch.aten.add.Scalar"(%16925, %16956, %16957) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
    "torch.bind_symbolic_shape"(%16958, %1478) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
    %16959 = "torch.prim.ListConstruct"(%1924) : (!torch.int) -> !torch.list<int>
    %16960 = "torch.aten.view"(%16958, %16959) : (!torch.vtensor<[4,?],si64>, !torch.list<int>) -> !torch.vtensor<[?],si64>
    "torch.bind_symbolic_shape"(%16960, %1478) <{shape_expressions = #map24}> : (!torch.vtensor<[?],si64>, !torch.int) -> ()
    %16961 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %16962 = "torch.aten.index_select"(%16942, %16961, %16960) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int, !torch.vtensor<[?],si64>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%16962, %1478) <{shape_expressions = #map23}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %16963 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %16964 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %16965 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %16966 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %16967 = "torch.prim.ListConstruct"(%16963, %1481, %16964, %16965, %16966) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %16968 = "torch.aten.view"(%16962, %16967) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%16968, %1478) <{shape_expressions = #map22}> : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %16969 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %16970 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %16971 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %16972 = "torch.prim.ListConstruct"(%16969, %1485, %16970, %16971) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %16973 = "torch.aten.view"(%16968, %16972) : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%16973, %1478) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %16974 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %16975 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %16976 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %16977 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %16978 = "torch.aten.slice.Tensor"(%16955, %16974, %16975, %16976, %16977) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%16978, %1478) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %16979 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %16980 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %16981 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
    %16982 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %16983 = "torch.aten.slice.Tensor"(%16973, %16979, %16980, %16981, %16982) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%16983, %1478) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
    %16984 = "torch.constant.int"() <{value = -2 : i64}> : () -> !torch.int
    %16985 = "torch.aten.unsqueeze"(%16978, %16984) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%16985, %1478) <{shape_expressions = #map25}> : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.int) -> ()
    %16986 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %16987 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %16988 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %16989 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %16990 = "torch.prim.ListConstruct"(%16986, %1485, %16987, %16988, %16989) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %16991 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %16992 = "torch.aten.expand"(%16985, %16990, %16991) : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%16992, %1478) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %16993 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %16994 = "torch.aten.clone"(%16992, %16993) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%16994, %1478) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %16995 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %16996 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %16997 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %16998 = "torch.prim.ListConstruct"(%16995, %1485, %16996, %16997) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %16999 = "torch.aten._unsafe_view"(%16994, %16998) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%16999, %1478) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %17000 = "torch.constant.int"() <{value = -2 : i64}> : () -> !torch.int
    %17001 = "torch.aten.unsqueeze"(%16983, %17000) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%17001, %1478) <{shape_expressions = #map25}> : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.int) -> ()
    %17002 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %17003 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
    %17004 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %17005 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %17006 = "torch.prim.ListConstruct"(%17002, %1485, %17003, %17004, %17005) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %17007 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %17008 = "torch.aten.expand"(%17001, %17006, %17007) : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%17008, %1478) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %17009 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %17010 = "torch.aten.clone"(%17008, %17009) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%17010, %1478) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
    %17011 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %17012 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
    %17013 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
    %17014 = "torch.prim.ListConstruct"(%17011, %1485, %17012, %17013) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %17015 = "torch.aten._unsafe_view"(%17010, %17014) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%17015, %1478) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
    %17016 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %17017 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %17018 = "torch.aten.transpose.int"(%16800, %17016, %17017) : (!torch.vtensor<[4,1,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,1,128],f8E4M3FNUZ>
    %17019 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %17020 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %17021 = "torch.aten.transpose.int"(%16999, %17019, %17020) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%17021, %1478) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %17022 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %17023 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %17024 = "torch.aten.transpose.int"(%17015, %17022, %17023) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%17024, %1478) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
    %17025 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %17026 = "torch.aten.squeeze.dim"(%1516, %17025) : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,1,1,?],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%17026, %1478) <{shape_expressions = #map7}> : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>, !torch.int) -> ()
    %17027 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %17028 = "torch.aten.squeeze.dim"(%17026, %17027) : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,1,1,?],f8E4M3FNUZ>
    "torch.bind_symbolic_shape"(%17028, %1478) <{shape_expressions = #map7}> : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>, !torch.int) -> ()
    %17029 = "torch_c.to_builtin_tensor"(%17018) : (!torch.vtensor<[4,32,1,128],f8E4M3FNUZ>) -> tensor<4x32x1x128xf8E4M3FNUZ>
    %17030 = "tensor.cast"(%17029) : (tensor<4x32x1x128xf8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %17031 = "torch_c.to_builtin_tensor"(%17021) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %17032 = "torch_c.to_builtin_tensor"(%17024) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
    %17033 = "torch_c.to_builtin_tensor"(%17028) : (!torch.vtensor<[4,1,1,?],f8E4M3FNUZ>) -> tensor<4x1x1x?xf8E4M3FNUZ>
    %17034 = "tensor.cast"(%17033) : (tensor<4x1x1x?xf8E4M3FNUZ>) -> tensor<?x?x?x?xf8E4M3FNUZ>
    %17035 = "torch_c.to_builtin_tensor"(%1454) : (!torch.vtensor<[],f32>) -> tensor<f32>
    %17036 = "util.call"(%17030, %17031, %17032, %17035, %17034) <{callee = @sharktank_masked_flash_attention_4_32_128_128_f8E4M3FNUZ_f32_f32}> : (tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<f32>, tensor<?x?x?x?xf8E4M3FNUZ>) -> tensor<4x32x?x128xf32>
    %17037 = "tensor.cast"(%17036) : (tensor<4x32x?x128xf32>) -> tensor<4x32x1x128xf32>
    %17038 = "torch_c.from_builtin_tensor"(%17037) : (tensor<4x32x1x128xf32>) -> !torch.vtensor<[4,32,1,128],f32>
    %17039 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %17040 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %17041 = "torch.aten.transpose.int"(%17038, %17039, %17040) : (!torch.vtensor<[4,32,1,128],f32>, !torch.int, !torch.int) -> !torch.vtensor<[4,1,32,128],f32>
    %17042 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %17043 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %17044 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %17045 = "torch.prim.ListConstruct"(%17042, %17043, %17044) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %17046 = "torch.aten.view"(%17041, %17045) : (!torch.vtensor<[4,1,32,128],f32>, !torch.list<int>) -> !torch.vtensor<[4,1,4096],f32>
    %17047 = "torch.aten.div.Tensor"(%17046, %1456) : (!torch.vtensor<[4,1,4096],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],f32>
    %17048 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %17049 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %17050 = "torch.aten.clamp"(%17047, %17048, %17049) : (!torch.vtensor<[4,1,4096],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],f32>
    %17051 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %17052 = "torch.prims.convert_element_type"(%17050, %17051) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %17053 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %17054 = "torch.aten.unsqueeze"(%1458, %17053) : (!torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>
    %17055 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %17056 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %17057 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %17058 = "torch.prim.ListConstruct"(%17055, %17056, %17057) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %17059 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %17060 = "torch.aten.expand"(%17054, %17058, %17059) : (!torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ>
    %17061 = "torch_c.to_builtin_tensor"(%17052) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %17062 = "torch_c.to_builtin_tensor"(%17060) : (!torch.vtensor<[4,4096,4096],f8E4M3FNUZ>) -> tensor<4x4096x4096xf8E4M3FNUZ>
    %17063 = "util.call"(%17061, %17062) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x1x4096xf32>
    %17064 = "torch_c.from_builtin_tensor"(%17063) : (tensor<4x1x4096xf32>) -> !torch.vtensor<[4,1,4096],f32>
    %17065 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %17066 = "torch.prims.convert_element_type"(%17064, %17065) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %17067 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %17068 = "torch.aten.add.Tensor"(%16667, %17066, %17067) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %17069 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %17070 = "torch.prims.convert_element_type"(%17068, %17069) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f32>
    %17071 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %17072 = "torch.aten.pow.Tensor_Scalar"(%17070, %17071) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],f32>
    %17073 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %17074 = "torch.prim.ListConstruct"(%17073) : (!torch.int) -> !torch.list<int>
    %17075 = "torch.constant.bool"() <{value = true}> : () -> !torch.bool
    %17076 = "torch.constant.none"() : () -> !torch.none
    %17077 = "torch.aten.mean.dim"(%17072, %17074, %17075, %17076) : (!torch.vtensor<[4,1,4096],f32>, !torch.list<int>, !torch.bool, !torch.none) -> !torch.vtensor<[4,1,1],f32>
    %17078 = "torch.constant.float"() <{value = 1.000000e-05 : f64}> : () -> !torch.float
    %17079 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %17080 = "torch.aten.add.Scalar"(%17077, %17078, %17079) : (!torch.vtensor<[4,1,1],f32>, !torch.float, !torch.int) -> !torch.vtensor<[4,1,1],f32>
    %17081 = "torch.aten.rsqrt"(%17080) : (!torch.vtensor<[4,1,1],f32>) -> !torch.vtensor<[4,1,1],f32>
    %17082 = "torch.aten.mul.Tensor"(%17070, %17081) : (!torch.vtensor<[4,1,4096],f32>, !torch.vtensor<[4,1,1],f32>) -> !torch.vtensor<[4,1,4096],f32>
    %17083 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %17084 = "torch.prims.convert_element_type"(%17082, %17083) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %17085 = "torch.aten.mul.Tensor"(%1460, %17084) : (!torch.vtensor<[4096],bf16>, !torch.vtensor<[4,1,4096],bf16>) -> !torch.vtensor<[4,1,4096],bf16>
    %17086 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %17087 = "torch.prims.convert_element_type"(%17085, %17086) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %17088 = "torch.aten.div.Tensor"(%17087, %1462) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %17089 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %17090 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %17091 = "torch.aten.clamp"(%17088, %17089, %17090) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %17092 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %17093 = "torch.prims.convert_element_type"(%17091, %17092) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %17094 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %17095 = "torch.aten.unsqueeze"(%1464, %17094) : (!torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>
    %17096 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %17097 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %17098 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %17099 = "torch.prim.ListConstruct"(%17096, %17097, %17098) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %17100 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %17101 = "torch.aten.expand"(%17095, %17099, %17100) : (!torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ>
    %17102 = "torch_c.to_builtin_tensor"(%17093) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %17103 = "torch_c.to_builtin_tensor"(%17101) : (!torch.vtensor<[4,14336,4096],f8E4M3FNUZ>) -> tensor<4x14336x4096xf8E4M3FNUZ>
    %17104 = "util.call"(%17102, %17103) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x1x14336xf32>
    %17105 = "torch_c.from_builtin_tensor"(%17104) : (tensor<4x1x14336xf32>) -> !torch.vtensor<[4,1,14336],f32>
    %17106 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %17107 = "torch.prims.convert_element_type"(%17105, %17106) : (!torch.vtensor<[4,1,14336],f32>, !torch.int) -> !torch.vtensor<[4,1,14336],bf16>
    %17108 = "torch.aten.silu"(%17107) : (!torch.vtensor<[4,1,14336],bf16>) -> !torch.vtensor<[4,1,14336],bf16>
    %17109 = "torch.aten.div.Tensor"(%17087, %1466) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,4096],bf16>
    %17110 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %17111 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %17112 = "torch.aten.clamp"(%17109, %17110, %17111) : (!torch.vtensor<[4,1,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,4096],bf16>
    %17113 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %17114 = "torch.prims.convert_element_type"(%17112, %17113) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f8E4M3FNUZ>
    %17115 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %17116 = "torch.aten.unsqueeze"(%1468, %17115) : (!torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>
    %17117 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %17118 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %17119 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %17120 = "torch.prim.ListConstruct"(%17117, %17118, %17119) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %17121 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %17122 = "torch.aten.expand"(%17116, %17120, %17121) : (!torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ>
    %17123 = "torch_c.to_builtin_tensor"(%17114) : (!torch.vtensor<[4,1,4096],f8E4M3FNUZ>) -> tensor<4x1x4096xf8E4M3FNUZ>
    %17124 = "torch_c.to_builtin_tensor"(%17122) : (!torch.vtensor<[4,14336,4096],f8E4M3FNUZ>) -> tensor<4x14336x4096xf8E4M3FNUZ>
    %17125 = "util.call"(%17123, %17124) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ}> : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x1x14336xf32>
    %17126 = "torch_c.from_builtin_tensor"(%17125) : (tensor<4x1x14336xf32>) -> !torch.vtensor<[4,1,14336],f32>
    %17127 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %17128 = "torch.prims.convert_element_type"(%17126, %17127) : (!torch.vtensor<[4,1,14336],f32>, !torch.int) -> !torch.vtensor<[4,1,14336],bf16>
    %17129 = "torch.aten.mul.Tensor"(%17108, %17128) : (!torch.vtensor<[4,1,14336],bf16>, !torch.vtensor<[4,1,14336],bf16>) -> !torch.vtensor<[4,1,14336],bf16>
    %17130 = "torch.aten.div.Tensor"(%17129, %1470) : (!torch.vtensor<[4,1,14336],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,14336],bf16>
    %17131 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
    %17132 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
    %17133 = "torch.aten.clamp"(%17130, %17131, %17132) : (!torch.vtensor<[4,1,14336],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,1,14336],bf16>
    %17134 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
    %17135 = "torch.prims.convert_element_type"(%17133, %17134) : (!torch.vtensor<[4,1,14336],bf16>, !torch.int) -> !torch.vtensor<[4,1,14336],f8E4M3FNUZ>
    %17136 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
    %17137 = "torch.aten.unsqueeze"(%1472, %17136) : (!torch.vtensor<[4096,14336],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,14336],f8E4M3FNUZ>
    %17138 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %17139 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %17140 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
    %17141 = "torch.prim.ListConstruct"(%17138, %17139, %17140) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %17142 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
    %17143 = "torch.aten.expand"(%17137, %17141, %17142) : (!torch.vtensor<[1,4096,14336],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,14336],f8E4M3FNUZ>
    %17144 = "torch_c.to_builtin_tensor"(%17135) : (!torch.vtensor<[4,1,14336],f8E4M3FNUZ>) -> tensor<4x1x14336xf8E4M3FNUZ>
    %17145 = "torch_c.to_builtin_tensor"(%17143) : (!torch.vtensor<[4,4096,14336],f8E4M3FNUZ>) -> tensor<4x4096x14336xf8E4M3FNUZ>
    %17146 = "util.call"(%17144, %17145) <{callee = @sharktank_batch_matmul_transpose_b_L4x1x14336xf8E4M3FNUZ_R4x4096x14336xf8E4M3FNUZ}> : (tensor<4x1x14336xf8E4M3FNUZ>, tensor<4x4096x14336xf8E4M3FNUZ>) -> tensor<4x1x4096xf32>
    %17147 = "torch_c.from_builtin_tensor"(%17146) : (tensor<4x1x4096xf32>) -> !torch.vtensor<[4,1,4096],f32>
    %17148 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %17149 = "torch.prims.convert_element_type"(%17147, %17148) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %17150 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %17151 = "torch.aten.add.Tensor"(%17068, %17149, %17150) : (!torch.vtensor<[4,1,4096],bf16>, !torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %17152 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
    %17153 = "torch.prims.convert_element_type"(%17151, %17152) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],f32>
    %17154 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
    %17155 = "torch.aten.pow.Tensor_Scalar"(%17153, %17154) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],f32>
    %17156 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %17157 = "torch.prim.ListConstruct"(%17156) : (!torch.int) -> !torch.list<int>
    %17158 = "torch.constant.bool"() <{value = true}> : () -> !torch.bool
    %17159 = "torch.constant.none"() : () -> !torch.none
    %17160 = "torch.aten.mean.dim"(%17155, %17157, %17158, %17159) : (!torch.vtensor<[4,1,4096],f32>, !torch.list<int>, !torch.bool, !torch.none) -> !torch.vtensor<[4,1,1],f32>
    %17161 = "torch.constant.float"() <{value = 1.000000e-05 : f64}> : () -> !torch.float
    %17162 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %17163 = "torch.aten.add.Scalar"(%17160, %17161, %17162) : (!torch.vtensor<[4,1,1],f32>, !torch.float, !torch.int) -> !torch.vtensor<[4,1,1],f32>
    %17164 = "torch.aten.rsqrt"(%17163) : (!torch.vtensor<[4,1,1],f32>) -> !torch.vtensor<[4,1,1],f32>
    %17165 = "torch.aten.mul.Tensor"(%17153, %17164) : (!torch.vtensor<[4,1,4096],f32>, !torch.vtensor<[4,1,1],f32>) -> !torch.vtensor<[4,1,4096],f32>
    %17166 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %17167 = "torch.prims.convert_element_type"(%17165, %17166) : (!torch.vtensor<[4,1,4096],f32>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %17168 = "torch.aten.mul.Tensor"(%1474, %17167) : (!torch.vtensor<[4096],bf16>, !torch.vtensor<[4,1,4096],bf16>) -> !torch.vtensor<[4,1,4096],bf16>
    %17169 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %17170 = "torch.prims.convert_element_type"(%17168, %17169) : (!torch.vtensor<[4,1,4096],bf16>, !torch.int) -> !torch.vtensor<[4,1,4096],bf16>
    %17171 = "torch.constant.int"() <{value = -2 : i64}> : () -> !torch.int
    %17172 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
    %17173 = "torch.aten.transpose.int"(%1476, %17171, %17172) : (!torch.vtensor<[128256,4096],bf16>, !torch.int, !torch.int) -> !torch.vtensor<[4096,128256],bf16>
    %17174 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
    %17175 = "torch.prims.convert_element_type"(%17173, %17174) : (!torch.vtensor<[4096,128256],bf16>, !torch.int) -> !torch.vtensor<[4096,128256],bf16>
    %17176 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %17177 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
    %17178 = "torch.prim.ListConstruct"(%17176, %17177) : (!torch.int, !torch.int) -> !torch.list<int>
    %17179 = "torch.aten.view"(%17170, %17178) : (!torch.vtensor<[4,1,4096],bf16>, !torch.list<int>) -> !torch.vtensor<[4,4096],bf16>
    %17180 = "torch.aten.mm"(%17179, %17175) : (!torch.vtensor<[4,4096],bf16>, !torch.vtensor<[4096,128256],bf16>) -> !torch.vtensor<[4,128256],bf16>
    %17181 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
    %17182 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
    %17183 = "torch.constant.int"() <{value = 128256 : i64}> : () -> !torch.int
    %17184 = "torch.prim.ListConstruct"(%17181, %17182, %17183) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %17185 = "torch.aten.view"(%17180, %17184) : (!torch.vtensor<[4,128256],bf16>, !torch.list<int>) -> !torch.vtensor<[4,1,128256],bf16>
    "func.return"(%17185) : (!torch.vtensor<[4,1,128256],bf16>) -> ()
  }) {torch.assume_strict_symbolic_shapes} : () -> ()
  "util.func"() <{function_type = (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x?x4096xf32>, sym_name = "sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ", sym_visibility = "private", tied_operands = [-1 : index]}> ({
  ^bb0(%arg55: tensor<4x?x4096xf8E4M3FNUZ>, %arg56: tensor<4x4096x4096xf8E4M3FNUZ>):
    %113 = "arith.constant"() <{value = 0.000000e+00 : f32}> : () -> f32
    %114 = "arith.constant"() <{value = 0 : index}> : () -> index
    %115 = "arith.constant"() <{value = 1 : index}> : () -> index
    %116 = "tensor.dim"(%arg55, %114) : (tensor<4x?x4096xf8E4M3FNUZ>, index) -> index
    %117 = "tensor.dim"(%arg55, %115) : (tensor<4x?x4096xf8E4M3FNUZ>, index) -> index
    %118 = "tensor.dim"(%arg56, %115) : (tensor<4x4096x4096xf8E4M3FNUZ>, index) -> index
    %119 = "tensor.empty"(%116, %117, %118) : (index, index, index) -> tensor<?x?x?xf32>
    %120 = "tensor.cast"(%119) : (tensor<?x?x?xf32>) -> tensor<4x?x4096xf32>
    %121 = "linalg.fill"(%113, %120) <{operandSegmentSizes = array<i32: 1, 1>}> ({
    ^bb0(%arg60: f32, %arg61: f32):
      "linalg.yield"(%arg60) : (f32) -> ()
    }) : (f32, tensor<4x?x4096xf32>) -> tensor<4x?x4096xf32>
    %122 = "linalg.batch_matmul_transpose_b"(%arg55, %arg56, %121) <{operandSegmentSizes = array<i32: 2, 1>}> ({
    ^bb0(%arg57: f8E4M3FNUZ, %arg58: f8E4M3FNUZ, %arg59: f32):
      %123 = "arith.extf"(%arg57) : (f8E4M3FNUZ) -> f32
      %124 = "arith.extf"(%arg58) : (f8E4M3FNUZ) -> f32
      %125 = "arith.mulf"(%123, %124) <{fastmath = #arith.fastmath<none>}> : (f32, f32) -> f32
      %126 = "arith.addf"(%arg59, %125) <{fastmath = #arith.fastmath<none>}> : (f32, f32) -> f32
      "linalg.yield"(%126) : (f32) -> ()
    }) {linalg.memoized_indexing_maps = [#map32, #map33, #map34]} : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>, tensor<4x?x4096xf32>) -> tensor<4x?x4096xf32>
    "util.return"(%122) : (tensor<4x?x4096xf32>) -> ()
  }) : () -> ()
  "util.func"() <{function_type = (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x?x1024xf32>, sym_name = "sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ", sym_visibility = "private", tied_operands = [-1 : index]}> ({
  ^bb0(%arg48: tensor<4x?x4096xf8E4M3FNUZ>, %arg49: tensor<4x1024x4096xf8E4M3FNUZ>):
    %99 = "arith.constant"() <{value = 0.000000e+00 : f32}> : () -> f32
    %100 = "arith.constant"() <{value = 0 : index}> : () -> index
    %101 = "arith.constant"() <{value = 1 : index}> : () -> index
    %102 = "tensor.dim"(%arg48, %100) : (tensor<4x?x4096xf8E4M3FNUZ>, index) -> index
    %103 = "tensor.dim"(%arg48, %101) : (tensor<4x?x4096xf8E4M3FNUZ>, index) -> index
    %104 = "tensor.dim"(%arg49, %101) : (tensor<4x1024x4096xf8E4M3FNUZ>, index) -> index
    %105 = "tensor.empty"(%102, %103, %104) : (index, index, index) -> tensor<?x?x?xf32>
    %106 = "tensor.cast"(%105) : (tensor<?x?x?xf32>) -> tensor<4x?x1024xf32>
    %107 = "linalg.fill"(%99, %106) <{operandSegmentSizes = array<i32: 1, 1>}> ({
    ^bb0(%arg53: f32, %arg54: f32):
      "linalg.yield"(%arg53) : (f32) -> ()
    }) : (f32, tensor<4x?x1024xf32>) -> tensor<4x?x1024xf32>
    %108 = "linalg.batch_matmul_transpose_b"(%arg48, %arg49, %107) <{operandSegmentSizes = array<i32: 2, 1>}> ({
    ^bb0(%arg50: f8E4M3FNUZ, %arg51: f8E4M3FNUZ, %arg52: f32):
      %109 = "arith.extf"(%arg50) : (f8E4M3FNUZ) -> f32
      %110 = "arith.extf"(%arg51) : (f8E4M3FNUZ) -> f32
      %111 = "arith.mulf"(%109, %110) <{fastmath = #arith.fastmath<none>}> : (f32, f32) -> f32
      %112 = "arith.addf"(%arg52, %111) <{fastmath = #arith.fastmath<none>}> : (f32, f32) -> f32
      "linalg.yield"(%112) : (f32) -> ()
    }) {linalg.memoized_indexing_maps = [#map32, #map33, #map34]} : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>, tensor<4x?x1024xf32>) -> tensor<4x?x1024xf32>
    "util.return"(%108) : (tensor<4x?x1024xf32>) -> ()
  }) : () -> ()
  "util.func"() <{function_type = (tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<f32>, tensor<?x?xf8E4M3FNUZ>) -> tensor<4x32x?x128xf32>, sym_name = "sharktank_masked_flash_attention_4_32_128_128_f8E4M3FNUZ_f32_f32", sym_visibility = "private", tied_operands = [-1 : index]}> ({
  ^bb0(%arg42: tensor<4x32x?x128xf8E4M3FNUZ>, %arg43: tensor<4x32x?x128xf8E4M3FNUZ>, %arg44: tensor<4x32x?x128xf8E4M3FNUZ>, %arg45: tensor<f32>, %arg46: tensor<?x?xf8E4M3FNUZ>):
    %84 = "arith.constant"() <{value = 0 : index}> : () -> index
    %85 = "arith.constant"() <{value = 1 : index}> : () -> index
    %86 = "arith.constant"() <{value = 2 : index}> : () -> index
    %87 = "arith.constant"() <{value = 3 : index}> : () -> index
    %88 = "arith.constant"() <{value = 128 : index}> : () -> index
    %89 = "tensor.dim"(%arg42, %86) : (tensor<4x32x?x128xf8E4M3FNUZ>, index) -> index
    %90 = "tensor.dim"(%arg44, %87) : (tensor<4x32x?x128xf8E4M3FNUZ>, index) -> index
    %91 = "tensor.extract"(%arg45) : (tensor<f32>) -> f32
    %92 = "tensor.empty"(%88, %89, %90) : (index, index, index) -> tensor<?x?x?xf32>
    %93 = "tensor.cast"(%92) : (tensor<?x?x?xf32>) -> tensor<128x?x128xf32>
    %94 = "tensor.collapse_shape"(%arg42) <{reassociation = [[0, 1], [2], [3]]}> : (tensor<4x32x?x128xf8E4M3FNUZ>) -> tensor<128x?x128xf8E4M3FNUZ>
    %95 = "tensor.collapse_shape"(%arg43) <{reassociation = [[0, 1], [2], [3]]}> : (tensor<4x32x?x128xf8E4M3FNUZ>) -> tensor<128x?x128xf8E4M3FNUZ>
    %96 = "tensor.collapse_shape"(%arg44) <{reassociation = [[0, 1], [2], [3]]}> : (tensor<4x32x?x128xf8E4M3FNUZ>) -> tensor<128x?x128xf8E4M3FNUZ>
    %97 = "iree_linalg_ext.attention"(%94, %95, %96, %91, %arg46, %93) <{indexing_maps = [#map35, #map36, #map37, #map38, #map39, #map40]}> ({
    ^bb0(%arg47: f32):
      "iree_linalg_ext.yield"(%arg47) : (f32) -> ()
    }) : (tensor<128x?x128xf8E4M3FNUZ>, tensor<128x?x128xf8E4M3FNUZ>, tensor<128x?x128xf8E4M3FNUZ>, f32, tensor<?x?xf8E4M3FNUZ>, tensor<128x?x128xf32>) -> tensor<128x?x128xf32>
    %98 = "tensor.expand_shape"(%97, %89) <{reassociation = [[0, 1], [2], [3]], static_output_shape = array<i64: 4, 32, -9223372036854775808, 128>}> : (tensor<128x?x128xf32>, index) -> tensor<4x32x?x128xf32>
    "util.return"(%98) : (tensor<4x32x?x128xf32>) -> ()
  }) : () -> ()
  "util.func"() <{function_type = (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x?x14336xf32>, sym_name = "sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ", sym_visibility = "private", tied_operands = [-1 : index]}> ({
  ^bb0(%arg35: tensor<4x?x4096xf8E4M3FNUZ>, %arg36: tensor<4x14336x4096xf8E4M3FNUZ>):
    %70 = "arith.constant"() <{value = 0.000000e+00 : f32}> : () -> f32
    %71 = "arith.constant"() <{value = 0 : index}> : () -> index
    %72 = "arith.constant"() <{value = 1 : index}> : () -> index
    %73 = "tensor.dim"(%arg35, %71) : (tensor<4x?x4096xf8E4M3FNUZ>, index) -> index
    %74 = "tensor.dim"(%arg35, %72) : (tensor<4x?x4096xf8E4M3FNUZ>, index) -> index
    %75 = "tensor.dim"(%arg36, %72) : (tensor<4x14336x4096xf8E4M3FNUZ>, index) -> index
    %76 = "tensor.empty"(%73, %74, %75) : (index, index, index) -> tensor<?x?x?xf32>
    %77 = "tensor.cast"(%76) : (tensor<?x?x?xf32>) -> tensor<4x?x14336xf32>
    %78 = "linalg.fill"(%70, %77) <{operandSegmentSizes = array<i32: 1, 1>}> ({
    ^bb0(%arg40: f32, %arg41: f32):
      "linalg.yield"(%arg40) : (f32) -> ()
    }) : (f32, tensor<4x?x14336xf32>) -> tensor<4x?x14336xf32>
    %79 = "linalg.batch_matmul_transpose_b"(%arg35, %arg36, %78) <{operandSegmentSizes = array<i32: 2, 1>}> ({
    ^bb0(%arg37: f8E4M3FNUZ, %arg38: f8E4M3FNUZ, %arg39: f32):
      %80 = "arith.extf"(%arg37) : (f8E4M3FNUZ) -> f32
      %81 = "arith.extf"(%arg38) : (f8E4M3FNUZ) -> f32
      %82 = "arith.mulf"(%80, %81) <{fastmath = #arith.fastmath<none>}> : (f32, f32) -> f32
      %83 = "arith.addf"(%arg39, %82) <{fastmath = #arith.fastmath<none>}> : (f32, f32) -> f32
      "linalg.yield"(%83) : (f32) -> ()
    }) {linalg.memoized_indexing_maps = [#map32, #map33, #map34]} : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>, tensor<4x?x14336xf32>) -> tensor<4x?x14336xf32>
    "util.return"(%79) : (tensor<4x?x14336xf32>) -> ()
  }) : () -> ()
  "util.func"() <{function_type = (tensor<4x?x14336xf8E4M3FNUZ>, tensor<4x4096x14336xf8E4M3FNUZ>) -> tensor<4x?x4096xf32>, sym_name = "sharktank_batch_matmul_transpose_b_L4xDx14336xf8E4M3FNUZ_R4x4096x14336xf8E4M3FNUZ", sym_visibility = "private", tied_operands = [-1 : index]}> ({
  ^bb0(%arg28: tensor<4x?x14336xf8E4M3FNUZ>, %arg29: tensor<4x4096x14336xf8E4M3FNUZ>):
    %56 = "arith.constant"() <{value = 0.000000e+00 : f32}> : () -> f32
    %57 = "arith.constant"() <{value = 0 : index}> : () -> index
    %58 = "arith.constant"() <{value = 1 : index}> : () -> index
    %59 = "tensor.dim"(%arg28, %57) : (tensor<4x?x14336xf8E4M3FNUZ>, index) -> index
    %60 = "tensor.dim"(%arg28, %58) : (tensor<4x?x14336xf8E4M3FNUZ>, index) -> index
    %61 = "tensor.dim"(%arg29, %58) : (tensor<4x4096x14336xf8E4M3FNUZ>, index) -> index
    %62 = "tensor.empty"(%59, %60, %61) : (index, index, index) -> tensor<?x?x?xf32>
    %63 = "tensor.cast"(%62) : (tensor<?x?x?xf32>) -> tensor<4x?x4096xf32>
    %64 = "linalg.fill"(%56, %63) <{operandSegmentSizes = array<i32: 1, 1>}> ({
    ^bb0(%arg33: f32, %arg34: f32):
      "linalg.yield"(%arg33) : (f32) -> ()
    }) : (f32, tensor<4x?x4096xf32>) -> tensor<4x?x4096xf32>
    %65 = "linalg.batch_matmul_transpose_b"(%arg28, %arg29, %64) <{operandSegmentSizes = array<i32: 2, 1>}> ({
    ^bb0(%arg30: f8E4M3FNUZ, %arg31: f8E4M3FNUZ, %arg32: f32):
      %66 = "arith.extf"(%arg30) : (f8E4M3FNUZ) -> f32
      %67 = "arith.extf"(%arg31) : (f8E4M3FNUZ) -> f32
      %68 = "arith.mulf"(%66, %67) <{fastmath = #arith.fastmath<none>}> : (f32, f32) -> f32
      %69 = "arith.addf"(%arg32, %68) <{fastmath = #arith.fastmath<none>}> : (f32, f32) -> f32
      "linalg.yield"(%69) : (f32) -> ()
    }) {linalg.memoized_indexing_maps = [#map32, #map33, #map34]} : (tensor<4x?x14336xf8E4M3FNUZ>, tensor<4x4096x14336xf8E4M3FNUZ>, tensor<4x?x4096xf32>) -> tensor<4x?x4096xf32>
    "util.return"(%65) : (tensor<4x?x4096xf32>) -> ()
  }) : () -> ()
  "util.func"() <{function_type = (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x1x4096xf32>, sym_name = "sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ", sym_visibility = "private", tied_operands = [-1 : index]}> ({
  ^bb0(%arg21: tensor<4x1x4096xf8E4M3FNUZ>, %arg22: tensor<4x4096x4096xf8E4M3FNUZ>):
    %42 = "arith.constant"() <{value = 0.000000e+00 : f32}> : () -> f32
    %43 = "arith.constant"() <{value = 0 : index}> : () -> index
    %44 = "arith.constant"() <{value = 1 : index}> : () -> index
    %45 = "tensor.dim"(%arg21, %43) : (tensor<4x1x4096xf8E4M3FNUZ>, index) -> index
    %46 = "tensor.dim"(%arg21, %44) : (tensor<4x1x4096xf8E4M3FNUZ>, index) -> index
    %47 = "tensor.dim"(%arg22, %44) : (tensor<4x4096x4096xf8E4M3FNUZ>, index) -> index
    %48 = "tensor.empty"(%45, %46, %47) : (index, index, index) -> tensor<?x?x?xf32>
    %49 = "tensor.cast"(%48) : (tensor<?x?x?xf32>) -> tensor<4x1x4096xf32>
    %50 = "linalg.fill"(%42, %49) <{operandSegmentSizes = array<i32: 1, 1>}> ({
    ^bb0(%arg26: f32, %arg27: f32):
      "linalg.yield"(%arg26) : (f32) -> ()
    }) : (f32, tensor<4x1x4096xf32>) -> tensor<4x1x4096xf32>
    %51 = "linalg.batch_matmul_transpose_b"(%arg21, %arg22, %50) <{operandSegmentSizes = array<i32: 2, 1>}> ({
    ^bb0(%arg23: f8E4M3FNUZ, %arg24: f8E4M3FNUZ, %arg25: f32):
      %52 = "arith.extf"(%arg23) : (f8E4M3FNUZ) -> f32
      %53 = "arith.extf"(%arg24) : (f8E4M3FNUZ) -> f32
      %54 = "arith.mulf"(%52, %53) <{fastmath = #arith.fastmath<none>}> : (f32, f32) -> f32
      %55 = "arith.addf"(%arg25, %54) <{fastmath = #arith.fastmath<none>}> : (f32, f32) -> f32
      "linalg.yield"(%55) : (f32) -> ()
    }) {linalg.memoized_indexing_maps = [#map32, #map33, #map34]} : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>, tensor<4x1x4096xf32>) -> tensor<4x1x4096xf32>
    "util.return"(%51) : (tensor<4x1x4096xf32>) -> ()
  }) : () -> ()
  "util.func"() <{function_type = (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x1x1024xf32>, sym_name = "sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ", sym_visibility = "private", tied_operands = [-1 : index]}> ({
  ^bb0(%arg14: tensor<4x1x4096xf8E4M3FNUZ>, %arg15: tensor<4x1024x4096xf8E4M3FNUZ>):
    %28 = "arith.constant"() <{value = 0.000000e+00 : f32}> : () -> f32
    %29 = "arith.constant"() <{value = 0 : index}> : () -> index
    %30 = "arith.constant"() <{value = 1 : index}> : () -> index
    %31 = "tensor.dim"(%arg14, %29) : (tensor<4x1x4096xf8E4M3FNUZ>, index) -> index
    %32 = "tensor.dim"(%arg14, %30) : (tensor<4x1x4096xf8E4M3FNUZ>, index) -> index
    %33 = "tensor.dim"(%arg15, %30) : (tensor<4x1024x4096xf8E4M3FNUZ>, index) -> index
    %34 = "tensor.empty"(%31, %32, %33) : (index, index, index) -> tensor<?x?x?xf32>
    %35 = "tensor.cast"(%34) : (tensor<?x?x?xf32>) -> tensor<4x1x1024xf32>
    %36 = "linalg.fill"(%28, %35) <{operandSegmentSizes = array<i32: 1, 1>}> ({
    ^bb0(%arg19: f32, %arg20: f32):
      "linalg.yield"(%arg19) : (f32) -> ()
    }) : (f32, tensor<4x1x1024xf32>) -> tensor<4x1x1024xf32>
    %37 = "linalg.batch_matmul_transpose_b"(%arg14, %arg15, %36) <{operandSegmentSizes = array<i32: 2, 1>}> ({
    ^bb0(%arg16: f8E4M3FNUZ, %arg17: f8E4M3FNUZ, %arg18: f32):
      %38 = "arith.extf"(%arg16) : (f8E4M3FNUZ) -> f32
      %39 = "arith.extf"(%arg17) : (f8E4M3FNUZ) -> f32
      %40 = "arith.mulf"(%38, %39) <{fastmath = #arith.fastmath<none>}> : (f32, f32) -> f32
      %41 = "arith.addf"(%arg18, %40) <{fastmath = #arith.fastmath<none>}> : (f32, f32) -> f32
      "linalg.yield"(%41) : (f32) -> ()
    }) {linalg.memoized_indexing_maps = [#map32, #map33, #map34]} : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>, tensor<4x1x1024xf32>) -> tensor<4x1x1024xf32>
    "util.return"(%37) : (tensor<4x1x1024xf32>) -> ()
  }) : () -> ()
  "util.func"() <{function_type = (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x1x14336xf32>, sym_name = "sharktank_batch_matmul_transpose_b_L4x1x4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ", sym_visibility = "private", tied_operands = [-1 : index]}> ({
  ^bb0(%arg7: tensor<4x1x4096xf8E4M3FNUZ>, %arg8: tensor<4x14336x4096xf8E4M3FNUZ>):
    %14 = "arith.constant"() <{value = 0.000000e+00 : f32}> : () -> f32
    %15 = "arith.constant"() <{value = 0 : index}> : () -> index
    %16 = "arith.constant"() <{value = 1 : index}> : () -> index
    %17 = "tensor.dim"(%arg7, %15) : (tensor<4x1x4096xf8E4M3FNUZ>, index) -> index
    %18 = "tensor.dim"(%arg7, %16) : (tensor<4x1x4096xf8E4M3FNUZ>, index) -> index
    %19 = "tensor.dim"(%arg8, %16) : (tensor<4x14336x4096xf8E4M3FNUZ>, index) -> index
    %20 = "tensor.empty"(%17, %18, %19) : (index, index, index) -> tensor<?x?x?xf32>
    %21 = "tensor.cast"(%20) : (tensor<?x?x?xf32>) -> tensor<4x1x14336xf32>
    %22 = "linalg.fill"(%14, %21) <{operandSegmentSizes = array<i32: 1, 1>}> ({
    ^bb0(%arg12: f32, %arg13: f32):
      "linalg.yield"(%arg12) : (f32) -> ()
    }) : (f32, tensor<4x1x14336xf32>) -> tensor<4x1x14336xf32>
    %23 = "linalg.batch_matmul_transpose_b"(%arg7, %arg8, %22) <{operandSegmentSizes = array<i32: 2, 1>}> ({
    ^bb0(%arg9: f8E4M3FNUZ, %arg10: f8E4M3FNUZ, %arg11: f32):
      %24 = "arith.extf"(%arg9) : (f8E4M3FNUZ) -> f32
      %25 = "arith.extf"(%arg10) : (f8E4M3FNUZ) -> f32
      %26 = "arith.mulf"(%24, %25) <{fastmath = #arith.fastmath<none>}> : (f32, f32) -> f32
      %27 = "arith.addf"(%arg11, %26) <{fastmath = #arith.fastmath<none>}> : (f32, f32) -> f32
      "linalg.yield"(%27) : (f32) -> ()
    }) {linalg.memoized_indexing_maps = [#map32, #map33, #map34]} : (tensor<4x1x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>, tensor<4x1x14336xf32>) -> tensor<4x1x14336xf32>
    "util.return"(%23) : (tensor<4x1x14336xf32>) -> ()
  }) : () -> ()
  "util.func"() <{function_type = (tensor<4x1x14336xf8E4M3FNUZ>, tensor<4x4096x14336xf8E4M3FNUZ>) -> tensor<4x1x4096xf32>, sym_name = "sharktank_batch_matmul_transpose_b_L4x1x14336xf8E4M3FNUZ_R4x4096x14336xf8E4M3FNUZ", sym_visibility = "private", tied_operands = [-1 : index]}> ({
  ^bb0(%arg0: tensor<4x1x14336xf8E4M3FNUZ>, %arg1: tensor<4x4096x14336xf8E4M3FNUZ>):
    %0 = "arith.constant"() <{value = 0.000000e+00 : f32}> : () -> f32
    %1 = "arith.constant"() <{value = 0 : index}> : () -> index
    %2 = "arith.constant"() <{value = 1 : index}> : () -> index
    %3 = "tensor.dim"(%arg0, %1) : (tensor<4x1x14336xf8E4M3FNUZ>, index) -> index
    %4 = "tensor.dim"(%arg0, %2) : (tensor<4x1x14336xf8E4M3FNUZ>, index) -> index
    %5 = "tensor.dim"(%arg1, %2) : (tensor<4x4096x14336xf8E4M3FNUZ>, index) -> index
    %6 = "tensor.empty"(%3, %4, %5) : (index, index, index) -> tensor<?x?x?xf32>
    %7 = "tensor.cast"(%6) : (tensor<?x?x?xf32>) -> tensor<4x1x4096xf32>
    %8 = "linalg.fill"(%0, %7) <{operandSegmentSizes = array<i32: 1, 1>}> ({
    ^bb0(%arg5: f32, %arg6: f32):
      "linalg.yield"(%arg5) : (f32) -> ()
    }) : (f32, tensor<4x1x4096xf32>) -> tensor<4x1x4096xf32>
    %9 = "linalg.batch_matmul_transpose_b"(%arg0, %arg1, %8) <{operandSegmentSizes = array<i32: 2, 1>}> ({
    ^bb0(%arg2: f8E4M3FNUZ, %arg3: f8E4M3FNUZ, %arg4: f32):
      %10 = "arith.extf"(%arg2) : (f8E4M3FNUZ) -> f32
      %11 = "arith.extf"(%arg3) : (f8E4M3FNUZ) -> f32
      %12 = "arith.mulf"(%10, %11) <{fastmath = #arith.fastmath<none>}> : (f32, f32) -> f32
      %13 = "arith.addf"(%arg4, %12) <{fastmath = #arith.fastmath<none>}> : (f32, f32) -> f32
      "linalg.yield"(%13) : (f32) -> ()
    }) {linalg.memoized_indexing_maps = [#map32, #map33, #map34]} : (tensor<4x1x14336xf8E4M3FNUZ>, tensor<4x4096x14336xf8E4M3FNUZ>, tensor<4x1x4096xf32>) -> tensor<4x1x4096xf32>
    "util.return"(%9) : (tensor<4x1x4096xf32>) -> ()
  }) : () -> ()
}) : () -> ()