// Gist 101f66d2dd0cfb9b54ec1e770d8bbadb by @AmosLewis, created February 19, 2025 20:40.
// Note: this file has been truncated; the full MLIR listing is available in the gist.
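// The affine maps below parametrize shapes on a single symbol s0. Reading the shapes
// that follow (this interpretation is inferred from the IR, not stated in the dump):
// s0 appears to count 32-token pages, so s0 * 32 is the dynamic sequence length;
// 4 looks like the batch size, 4096 the model dimension, 14336 the FFN dimension,
// 128256 the vocabulary size, with 32 query heads and 8 KV heads of head dim 128,
// i.e. a Llama-3-8B-like configuration.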
#map = affine_map<()[s0] -> (4, s0 * 32)>
#map1 = affine_map<()[s0] -> (4, s0)>
#map2 = affine_map<()[s0] -> (s0, 2097152)>
#map3 = affine_map<()[s0] -> (s0 * 32)>
#map4 = affine_map<()[s0] -> (1, 1, s0 * 32, 131072)>
#map5 = affine_map<()[s0] -> (1, 1, s0 * 32, s0 * 32)>
#map6 = affine_map<()[s0] -> (4, 1, s0 * 32)>
#map7 = affine_map<()[s0] -> (4, 1, 1, s0 * 32)>
#map8 = affine_map<()[s0] -> (4, 1, s0 * 32, s0 * 32)>
#map9 = affine_map<()[s0] -> (4, s0 * 32, 4096)>
#map10 = affine_map<()[s0] -> (4, s0 * 32, 1)>
#map11 = affine_map<()[s0] -> (4, s0 * 32, 1024)>
#map12 = affine_map<()[s0] -> (4, s0 * 32, 32, 128)>
#map13 = affine_map<()[s0] -> (4, s0 * 32, 8, 128)>
#map14 = affine_map<()[s0] -> (s0 * 32, 128)>
#map15 = affine_map<()[s0] -> (1, s0 * 32, 128)>
#map16 = affine_map<()[s0] -> (1, s0 * 32, 1, 128)>
#map17 = affine_map<()[s0] -> (4, s0 * 32, 1, 128)>
#map18 = affine_map<()[s0] -> (4, s0 * 32, 32, 64)>
#map19 = affine_map<()[s0] -> (4, s0 * 32, 8, 64)>
#map20 = affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)>
#map21 = affine_map<()[s0] -> (s0 * 64, 32, 8, 128)>
#map22 = affine_map<()[s0] -> (4, s0, 32, 8, 128)>
#map23 = affine_map<()[s0] -> (s0 * 4, 32, 8, 128)>
#map24 = affine_map<()[s0] -> (s0 * 4)>
#map25 = affine_map<()[s0] -> (4, s0 * 32, 8, 1, 128)>
#map26 = affine_map<()[s0] -> (4, s0 * 32, 8, 4, 128)>
#map27 = affine_map<()[s0] -> (4, 32, s0 * 32, 128)>
#map28 = affine_map<()[s0] -> (4, s0 * 32, 14336)>
#map29 = affine_map<()[s0] -> (s0 * 128, 4096)>
#map30 = affine_map<()[s0] -> (s0 * 128, 128256)>
#map31 = affine_map<()[s0] -> (4, s0 * 32, 128256)>
#map32 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>
#map33 = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3)>
#map34 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
#map35 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d3)>
#map36 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d4, d3)>
#map37 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d4, d2)>
#map38 = affine_map<(d0, d1, d2, d3, d4) -> ()>
#map39 = affine_map<(d0, d1, d2, d3, d4) -> (d1, d4)>
#map40 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)>
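// The module below declares one private util.global per model parameter. Each global's
// initial_value is a #stream.parameter.named reference into the parameter archive
// scoped "model", so weights are resolved from the archive at load time rather than
// embedded in the IR. Quantized weights carry a ":qs" suffix and hold fp8
// (f8E4M3FNUZ) values; they are paired with scalar f32 ":rscale" globals
// (presumably reciprocal scales) for the quantized inputs and outputs of each op.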
"builtin.module"() <{sym_name = "module"}> ({
"util.global"() <{initial_value = #stream.parameter.named<"model"::"token_embd.weight"> : tensor<128256x4096xbf16>, sym_name = "__auto.token_embd.weight", sym_visibility = "private", type = tensor<128256x4096xbf16>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.0.attn_norm.weight"> : tensor<4096xbf16>, sym_name = "__auto.blk.0.attn_norm.weight", sym_visibility = "private", type = tensor<4096xbf16>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.0.attn_q.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.0.attn_q.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.0.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.0.attn_q.weight:qs", sym_visibility = "private", type = tensor<4096x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.0.attn_q.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.0.attn_q.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.0.attn_k.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.0.attn_k.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.0.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.0.attn_k.weight:qs", sym_visibility = "private", type = tensor<1024x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.0.attn_k.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.0.attn_k.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.0.attn_v.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.0.attn_v.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.0.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.0.attn_v.weight:qs", sym_visibility = "private", type = tensor<1024x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.0.attn_v.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.0.attn_v.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.0.attn_scale"> : tensor<f32>, sym_name = "__auto.blk.0.attn_scale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.0.attn_output.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.0.attn_output.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.0.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.0.attn_output.weight:qs", sym_visibility = "private", type = tensor<4096x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.0.ffn_norm.weight"> : tensor<4096xbf16>, sym_name = "__auto.blk.0.ffn_norm.weight", sym_visibility = "private", type = tensor<4096xbf16>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.0.ffn_gate.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.0.ffn_gate.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.0.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.0.ffn_gate.weight:qs", sym_visibility = "private", type = tensor<14336x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.0.ffn_up.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.0.ffn_up.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.0.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.0.ffn_up.weight:qs", sym_visibility = "private", type = tensor<14336x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.0.ffn_down.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.0.ffn_down.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.0.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>, sym_name = "__auto.blk.0.ffn_down.weight:qs", sym_visibility = "private", type = tensor<4096x14336xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.1.attn_norm.weight"> : tensor<4096xbf16>, sym_name = "__auto.blk.1.attn_norm.weight", sym_visibility = "private", type = tensor<4096xbf16>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.1.attn_q.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.1.attn_q.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.1.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.1.attn_q.weight:qs", sym_visibility = "private", type = tensor<4096x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.1.attn_q.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.1.attn_q.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.1.attn_k.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.1.attn_k.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.1.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.1.attn_k.weight:qs", sym_visibility = "private", type = tensor<1024x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.1.attn_k.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.1.attn_k.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.1.attn_v.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.1.attn_v.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.1.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.1.attn_v.weight:qs", sym_visibility = "private", type = tensor<1024x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.1.attn_v.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.1.attn_v.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.1.attn_scale"> : tensor<f32>, sym_name = "__auto.blk.1.attn_scale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.1.attn_output.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.1.attn_output.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.1.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.1.attn_output.weight:qs", sym_visibility = "private", type = tensor<4096x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.1.ffn_norm.weight"> : tensor<4096xbf16>, sym_name = "__auto.blk.1.ffn_norm.weight", sym_visibility = "private", type = tensor<4096xbf16>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.1.ffn_gate.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.1.ffn_gate.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.1.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.1.ffn_gate.weight:qs", sym_visibility = "private", type = tensor<14336x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.1.ffn_up.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.1.ffn_up.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.1.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.1.ffn_up.weight:qs", sym_visibility = "private", type = tensor<14336x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.1.ffn_down.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.1.ffn_down.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.1.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>, sym_name = "__auto.blk.1.ffn_down.weight:qs", sym_visibility = "private", type = tensor<4096x14336xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.2.attn_norm.weight"> : tensor<4096xbf16>, sym_name = "__auto.blk.2.attn_norm.weight", sym_visibility = "private", type = tensor<4096xbf16>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.2.attn_q.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.2.attn_q.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.2.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.2.attn_q.weight:qs", sym_visibility = "private", type = tensor<4096x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.2.attn_q.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.2.attn_q.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.2.attn_k.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.2.attn_k.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.2.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.2.attn_k.weight:qs", sym_visibility = "private", type = tensor<1024x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.2.attn_k.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.2.attn_k.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.2.attn_v.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.2.attn_v.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.2.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.2.attn_v.weight:qs", sym_visibility = "private", type = tensor<1024x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.2.attn_v.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.2.attn_v.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.2.attn_scale"> : tensor<f32>, sym_name = "__auto.blk.2.attn_scale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.2.attn_output.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.2.attn_output.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.2.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.2.attn_output.weight:qs", sym_visibility = "private", type = tensor<4096x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.2.ffn_norm.weight"> : tensor<4096xbf16>, sym_name = "__auto.blk.2.ffn_norm.weight", sym_visibility = "private", type = tensor<4096xbf16>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.2.ffn_gate.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.2.ffn_gate.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.2.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.2.ffn_gate.weight:qs", sym_visibility = "private", type = tensor<14336x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.2.ffn_up.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.2.ffn_up.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.2.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.2.ffn_up.weight:qs", sym_visibility = "private", type = tensor<14336x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.2.ffn_down.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.2.ffn_down.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.2.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>, sym_name = "__auto.blk.2.ffn_down.weight:qs", sym_visibility = "private", type = tensor<4096x14336xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.3.attn_norm.weight"> : tensor<4096xbf16>, sym_name = "__auto.blk.3.attn_norm.weight", sym_visibility = "private", type = tensor<4096xbf16>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.3.attn_q.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.3.attn_q.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.3.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.3.attn_q.weight:qs", sym_visibility = "private", type = tensor<4096x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.3.attn_q.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.3.attn_q.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.3.attn_k.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.3.attn_k.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.3.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.3.attn_k.weight:qs", sym_visibility = "private", type = tensor<1024x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.3.attn_k.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.3.attn_k.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.3.attn_v.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.3.attn_v.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.3.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.3.attn_v.weight:qs", sym_visibility = "private", type = tensor<1024x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.3.attn_v.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.3.attn_v.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.3.attn_scale"> : tensor<f32>, sym_name = "__auto.blk.3.attn_scale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.3.attn_output.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.3.attn_output.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.3.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.3.attn_output.weight:qs", sym_visibility = "private", type = tensor<4096x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.3.ffn_norm.weight"> : tensor<4096xbf16>, sym_name = "__auto.blk.3.ffn_norm.weight", sym_visibility = "private", type = tensor<4096xbf16>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.3.ffn_gate.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.3.ffn_gate.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.3.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.3.ffn_gate.weight:qs", sym_visibility = "private", type = tensor<14336x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.3.ffn_up.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.3.ffn_up.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.3.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.3.ffn_up.weight:qs", sym_visibility = "private", type = tensor<14336x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.3.ffn_down.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.3.ffn_down.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.3.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>, sym_name = "__auto.blk.3.ffn_down.weight:qs", sym_visibility = "private", type = tensor<4096x14336xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.4.attn_norm.weight"> : tensor<4096xbf16>, sym_name = "__auto.blk.4.attn_norm.weight", sym_visibility = "private", type = tensor<4096xbf16>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.4.attn_q.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.4.attn_q.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.4.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.4.attn_q.weight:qs", sym_visibility = "private", type = tensor<4096x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.4.attn_q.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.4.attn_q.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.4.attn_k.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.4.attn_k.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.4.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.4.attn_k.weight:qs", sym_visibility = "private", type = tensor<1024x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.4.attn_k.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.4.attn_k.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.4.attn_v.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.4.attn_v.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.4.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.4.attn_v.weight:qs", sym_visibility = "private", type = tensor<1024x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.4.attn_v.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.4.attn_v.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.4.attn_scale"> : tensor<f32>, sym_name = "__auto.blk.4.attn_scale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.4.attn_output.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.4.attn_output.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.4.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.4.attn_output.weight:qs", sym_visibility = "private", type = tensor<4096x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.4.ffn_norm.weight"> : tensor<4096xbf16>, sym_name = "__auto.blk.4.ffn_norm.weight", sym_visibility = "private", type = tensor<4096xbf16>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.4.ffn_gate.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.4.ffn_gate.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.4.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.4.ffn_gate.weight:qs", sym_visibility = "private", type = tensor<14336x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.4.ffn_up.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.4.ffn_up.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.4.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.4.ffn_up.weight:qs", sym_visibility = "private", type = tensor<14336x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.4.ffn_down.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.4.ffn_down.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.4.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>, sym_name = "__auto.blk.4.ffn_down.weight:qs", sym_visibility = "private", type = tensor<4096x14336xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.5.attn_norm.weight"> : tensor<4096xbf16>, sym_name = "__auto.blk.5.attn_norm.weight", sym_visibility = "private", type = tensor<4096xbf16>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.5.attn_q.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.5.attn_q.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.5.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.5.attn_q.weight:qs", sym_visibility = "private", type = tensor<4096x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.5.attn_q.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.5.attn_q.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.5.attn_k.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.5.attn_k.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.5.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.5.attn_k.weight:qs", sym_visibility = "private", type = tensor<1024x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.5.attn_k.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.5.attn_k.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.5.attn_v.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.5.attn_v.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.5.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.5.attn_v.weight:qs", sym_visibility = "private", type = tensor<1024x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.5.attn_v.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.5.attn_v.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.5.attn_scale"> : tensor<f32>, sym_name = "__auto.blk.5.attn_scale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.5.attn_output.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.5.attn_output.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.5.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.5.attn_output.weight:qs", sym_visibility = "private", type = tensor<4096x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.5.ffn_norm.weight"> : tensor<4096xbf16>, sym_name = "__auto.blk.5.ffn_norm.weight", sym_visibility = "private", type = tensor<4096xbf16>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.5.ffn_gate.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.5.ffn_gate.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.5.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.5.ffn_gate.weight:qs", sym_visibility = "private", type = tensor<14336x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.5.ffn_up.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.5.ffn_up.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.5.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.5.ffn_up.weight:qs", sym_visibility = "private", type = tensor<14336x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.5.ffn_down.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.5.ffn_down.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.5.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>, sym_name = "__auto.blk.5.ffn_down.weight:qs", sym_visibility = "private", type = tensor<4096x14336xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.6.attn_norm.weight"> : tensor<4096xbf16>, sym_name = "__auto.blk.6.attn_norm.weight", sym_visibility = "private", type = tensor<4096xbf16>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.6.attn_q.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.6.attn_q.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.6.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.6.attn_q.weight:qs", sym_visibility = "private", type = tensor<4096x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.6.attn_q.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.6.attn_q.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.6.attn_k.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.6.attn_k.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.6.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.6.attn_k.weight:qs", sym_visibility = "private", type = tensor<1024x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.6.attn_k.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.6.attn_k.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.6.attn_v.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.6.attn_v.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.6.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.6.attn_v.weight:qs", sym_visibility = "private", type = tensor<1024x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.6.attn_v.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.6.attn_v.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.6.attn_scale"> : tensor<f32>, sym_name = "__auto.blk.6.attn_scale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.6.attn_output.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.6.attn_output.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.6.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.6.attn_output.weight:qs", sym_visibility = "private", type = tensor<4096x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.6.ffn_norm.weight"> : tensor<4096xbf16>, sym_name = "__auto.blk.6.ffn_norm.weight", sym_visibility = "private", type = tensor<4096xbf16>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.6.ffn_gate.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.6.ffn_gate.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.6.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.6.ffn_gate.weight:qs", sym_visibility = "private", type = tensor<14336x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.6.ffn_up.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.6.ffn_up.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.6.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.6.ffn_up.weight:qs", sym_visibility = "private", type = tensor<14336x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.6.ffn_down.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.6.ffn_down.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.6.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>, sym_name = "__auto.blk.6.ffn_down.weight:qs", sym_visibility = "private", type = tensor<4096x14336xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.7.attn_norm.weight"> : tensor<4096xbf16>, sym_name = "__auto.blk.7.attn_norm.weight", sym_visibility = "private", type = tensor<4096xbf16>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.7.attn_q.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.7.attn_q.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.7.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.7.attn_q.weight:qs", sym_visibility = "private", type = tensor<4096x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.7.attn_q.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.7.attn_q.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.7.attn_k.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.7.attn_k.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.7.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.7.attn_k.weight:qs", sym_visibility = "private", type = tensor<1024x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.7.attn_k.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.7.attn_k.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.7.attn_v.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.7.attn_v.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.7.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.7.attn_v.weight:qs", sym_visibility = "private", type = tensor<1024x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.7.attn_v.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.7.attn_v.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.7.attn_scale"> : tensor<f32>, sym_name = "__auto.blk.7.attn_scale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.7.attn_output.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.7.attn_output.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.7.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.7.attn_output.weight:qs", sym_visibility = "private", type = tensor<4096x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.7.ffn_norm.weight"> : tensor<4096xbf16>, sym_name = "__auto.blk.7.ffn_norm.weight", sym_visibility = "private", type = tensor<4096xbf16>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.7.ffn_gate.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.7.ffn_gate.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.7.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.7.ffn_gate.weight:qs", sym_visibility = "private", type = tensor<14336x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.7.ffn_up.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.7.ffn_up.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.7.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.7.ffn_up.weight:qs", sym_visibility = "private", type = tensor<14336x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.7.ffn_down.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.7.ffn_down.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.7.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>, sym_name = "__auto.blk.7.ffn_down.weight:qs", sym_visibility = "private", type = tensor<4096x14336xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.8.attn_norm.weight"> : tensor<4096xbf16>, sym_name = "__auto.blk.8.attn_norm.weight", sym_visibility = "private", type = tensor<4096xbf16>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.8.attn_q.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.8.attn_q.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.8.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.8.attn_q.weight:qs", sym_visibility = "private", type = tensor<4096x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.8.attn_q.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.8.attn_q.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.8.attn_k.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.8.attn_k.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.8.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.8.attn_k.weight:qs", sym_visibility = "private", type = tensor<1024x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.8.attn_k.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.8.attn_k.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.8.attn_v.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.8.attn_v.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.8.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.8.attn_v.weight:qs", sym_visibility = "private", type = tensor<1024x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.8.attn_v.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.8.attn_v.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.8.attn_scale"> : tensor<f32>, sym_name = "__auto.blk.8.attn_scale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.8.attn_output.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.8.attn_output.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.8.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.8.attn_output.weight:qs", sym_visibility = "private", type = tensor<4096x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.8.ffn_norm.weight"> : tensor<4096xbf16>, sym_name = "__auto.blk.8.ffn_norm.weight", sym_visibility = "private", type = tensor<4096xbf16>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.8.ffn_gate.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.8.ffn_gate.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.8.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.8.ffn_gate.weight:qs", sym_visibility = "private", type = tensor<14336x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.8.ffn_up.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.8.ffn_up.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.8.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.8.ffn_up.weight:qs", sym_visibility = "private", type = tensor<14336x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.8.ffn_down.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.8.ffn_down.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.8.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>, sym_name = "__auto.blk.8.ffn_down.weight:qs", sym_visibility = "private", type = tensor<4096x14336xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.9.attn_norm.weight"> : tensor<4096xbf16>, sym_name = "__auto.blk.9.attn_norm.weight", sym_visibility = "private", type = tensor<4096xbf16>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.9.attn_q.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.9.attn_q.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.9.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.9.attn_q.weight:qs", sym_visibility = "private", type = tensor<4096x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.9.attn_q.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.9.attn_q.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.9.attn_k.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.9.attn_k.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.9.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.9.attn_k.weight:qs", sym_visibility = "private", type = tensor<1024x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.9.attn_k.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.9.attn_k.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.9.attn_v.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.9.attn_v.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.9.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.9.attn_v.weight:qs", sym_visibility = "private", type = tensor<1024x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.9.attn_v.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.9.attn_v.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.9.attn_scale"> : tensor<f32>, sym_name = "__auto.blk.9.attn_scale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.9.attn_output.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.9.attn_output.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.9.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.9.attn_output.weight:qs", sym_visibility = "private", type = tensor<4096x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.9.ffn_norm.weight"> : tensor<4096xbf16>, sym_name = "__auto.blk.9.ffn_norm.weight", sym_visibility = "private", type = tensor<4096xbf16>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.9.ffn_gate.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.9.ffn_gate.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.9.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.9.ffn_gate.weight:qs", sym_visibility = "private", type = tensor<14336x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.9.ffn_up.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.9.ffn_up.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.9.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.9.ffn_up.weight:qs", sym_visibility = "private", type = tensor<14336x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.9.ffn_down.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.9.ffn_down.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.9.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>, sym_name = "__auto.blk.9.ffn_down.weight:qs", sym_visibility = "private", type = tensor<4096x14336xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.10.attn_norm.weight"> : tensor<4096xbf16>, sym_name = "__auto.blk.10.attn_norm.weight", sym_visibility = "private", type = tensor<4096xbf16>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.10.attn_q.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.10.attn_q.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.10.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.10.attn_q.weight:qs", sym_visibility = "private", type = tensor<4096x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.10.attn_q.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.10.attn_q.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.10.attn_k.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.10.attn_k.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.10.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.10.attn_k.weight:qs", sym_visibility = "private", type = tensor<1024x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.10.attn_k.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.10.attn_k.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.10.attn_v.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.10.attn_v.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.10.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.10.attn_v.weight:qs", sym_visibility = "private", type = tensor<1024x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.10.attn_v.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.10.attn_v.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.10.attn_scale"> : tensor<f32>, sym_name = "__auto.blk.10.attn_scale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.10.attn_output.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.10.attn_output.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.10.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.10.attn_output.weight:qs", sym_visibility = "private", type = tensor<4096x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.10.ffn_norm.weight"> : tensor<4096xbf16>, sym_name = "__auto.blk.10.ffn_norm.weight", sym_visibility = "private", type = tensor<4096xbf16>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.10.ffn_gate.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.10.ffn_gate.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.10.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.10.ffn_gate.weight:qs", sym_visibility = "private", type = tensor<14336x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.10.ffn_up.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.10.ffn_up.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.10.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.10.ffn_up.weight:qs", sym_visibility = "private", type = tensor<14336x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.10.ffn_down.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.10.ffn_down.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.10.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>, sym_name = "__auto.blk.10.ffn_down.weight:qs", sym_visibility = "private", type = tensor<4096x14336xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.11.attn_norm.weight"> : tensor<4096xbf16>, sym_name = "__auto.blk.11.attn_norm.weight", sym_visibility = "private", type = tensor<4096xbf16>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.11.attn_q.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.11.attn_q.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.11.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.11.attn_q.weight:qs", sym_visibility = "private", type = tensor<4096x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.11.attn_q.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.11.attn_q.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.11.attn_k.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.11.attn_k.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.11.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.11.attn_k.weight:qs", sym_visibility = "private", type = tensor<1024x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.11.attn_k.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.11.attn_k.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.11.attn_v.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.11.attn_v.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.11.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.11.attn_v.weight:qs", sym_visibility = "private", type = tensor<1024x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.11.attn_v.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.11.attn_v.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.11.attn_scale"> : tensor<f32>, sym_name = "__auto.blk.11.attn_scale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.11.attn_output.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.11.attn_output.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.11.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.11.attn_output.weight:qs", sym_visibility = "private", type = tensor<4096x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.11.ffn_norm.weight"> : tensor<4096xbf16>, sym_name = "__auto.blk.11.ffn_norm.weight", sym_visibility = "private", type = tensor<4096xbf16>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.11.ffn_gate.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.11.ffn_gate.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.11.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.11.ffn_gate.weight:qs", sym_visibility = "private", type = tensor<14336x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.11.ffn_up.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.11.ffn_up.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.11.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.11.ffn_up.weight:qs", sym_visibility = "private", type = tensor<14336x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.11.ffn_down.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.11.ffn_down.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.11.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>, sym_name = "__auto.blk.11.ffn_down.weight:qs", sym_visibility = "private", type = tensor<4096x14336xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.12.attn_norm.weight"> : tensor<4096xbf16>, sym_name = "__auto.blk.12.attn_norm.weight", sym_visibility = "private", type = tensor<4096xbf16>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.12.attn_q.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.12.attn_q.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.12.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.12.attn_q.weight:qs", sym_visibility = "private", type = tensor<4096x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.12.attn_q.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.12.attn_q.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.12.attn_k.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.12.attn_k.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.12.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.12.attn_k.weight:qs", sym_visibility = "private", type = tensor<1024x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.12.attn_k.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.12.attn_k.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.12.attn_v.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.12.attn_v.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.12.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.12.attn_v.weight:qs", sym_visibility = "private", type = tensor<1024x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.12.attn_v.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.12.attn_v.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.12.attn_scale"> : tensor<f32>, sym_name = "__auto.blk.12.attn_scale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.12.attn_output.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.12.attn_output.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.12.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.12.attn_output.weight:qs", sym_visibility = "private", type = tensor<4096x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.12.ffn_norm.weight"> : tensor<4096xbf16>, sym_name = "__auto.blk.12.ffn_norm.weight", sym_visibility = "private", type = tensor<4096xbf16>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.12.ffn_gate.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.12.ffn_gate.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.12.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.12.ffn_gate.weight:qs", sym_visibility = "private", type = tensor<14336x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.12.ffn_up.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.12.ffn_up.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.12.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.12.ffn_up.weight:qs", sym_visibility = "private", type = tensor<14336x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.12.ffn_down.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.12.ffn_down.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.12.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>, sym_name = "__auto.blk.12.ffn_down.weight:qs", sym_visibility = "private", type = tensor<4096x14336xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.13.attn_norm.weight"> : tensor<4096xbf16>, sym_name = "__auto.blk.13.attn_norm.weight", sym_visibility = "private", type = tensor<4096xbf16>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.13.attn_q.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.13.attn_q.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.13.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.13.attn_q.weight:qs", sym_visibility = "private", type = tensor<4096x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.13.attn_q.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.13.attn_q.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.13.attn_k.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.13.attn_k.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.13.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.13.attn_k.weight:qs", sym_visibility = "private", type = tensor<1024x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.13.attn_k.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.13.attn_k.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.13.attn_v.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.13.attn_v.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.13.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.13.attn_v.weight:qs", sym_visibility = "private", type = tensor<1024x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.13.attn_v.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.13.attn_v.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.13.attn_scale"> : tensor<f32>, sym_name = "__auto.blk.13.attn_scale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.13.attn_output.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.13.attn_output.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.13.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.13.attn_output.weight:qs", sym_visibility = "private", type = tensor<4096x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.13.ffn_norm.weight"> : tensor<4096xbf16>, sym_name = "__auto.blk.13.ffn_norm.weight", sym_visibility = "private", type = tensor<4096xbf16>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.13.ffn_gate.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.13.ffn_gate.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.13.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.13.ffn_gate.weight:qs", sym_visibility = "private", type = tensor<14336x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.13.ffn_up.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.13.ffn_up.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.13.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.13.ffn_up.weight:qs", sym_visibility = "private", type = tensor<14336x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.13.ffn_down.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.13.ffn_down.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.13.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>, sym_name = "__auto.blk.13.ffn_down.weight:qs", sym_visibility = "private", type = tensor<4096x14336xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.14.attn_norm.weight"> : tensor<4096xbf16>, sym_name = "__auto.blk.14.attn_norm.weight", sym_visibility = "private", type = tensor<4096xbf16>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.14.attn_q.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.14.attn_q.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.14.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.14.attn_q.weight:qs", sym_visibility = "private", type = tensor<4096x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.14.attn_q.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.14.attn_q.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.14.attn_k.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.14.attn_k.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.14.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.14.attn_k.weight:qs", sym_visibility = "private", type = tensor<1024x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.14.attn_k.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.14.attn_k.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.14.attn_v.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.14.attn_v.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.14.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.14.attn_v.weight:qs", sym_visibility = "private", type = tensor<1024x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.14.attn_v.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.14.attn_v.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.14.attn_scale"> : tensor<f32>, sym_name = "__auto.blk.14.attn_scale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.14.attn_output.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.14.attn_output.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.14.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.14.attn_output.weight:qs", sym_visibility = "private", type = tensor<4096x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.14.ffn_norm.weight"> : tensor<4096xbf16>, sym_name = "__auto.blk.14.ffn_norm.weight", sym_visibility = "private", type = tensor<4096xbf16>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.14.ffn_gate.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.14.ffn_gate.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.14.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.14.ffn_gate.weight:qs", sym_visibility = "private", type = tensor<14336x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.14.ffn_up.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.14.ffn_up.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.14.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.14.ffn_up.weight:qs", sym_visibility = "private", type = tensor<14336x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.14.ffn_down.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.14.ffn_down.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.14.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>, sym_name = "__auto.blk.14.ffn_down.weight:qs", sym_visibility = "private", type = tensor<4096x14336xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.15.attn_norm.weight"> : tensor<4096xbf16>, sym_name = "__auto.blk.15.attn_norm.weight", sym_visibility = "private", type = tensor<4096xbf16>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.15.attn_q.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.15.attn_q.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.15.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.15.attn_q.weight:qs", sym_visibility = "private", type = tensor<4096x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.15.attn_q.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.15.attn_q.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.15.attn_k.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.15.attn_k.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.15.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.15.attn_k.weight:qs", sym_visibility = "private", type = tensor<1024x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.15.attn_k.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.15.attn_k.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.15.attn_v.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.15.attn_v.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.15.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.15.attn_v.weight:qs", sym_visibility = "private", type = tensor<1024x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.15.attn_v.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.15.attn_v.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.15.attn_scale"> : tensor<f32>, sym_name = "__auto.blk.15.attn_scale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.15.attn_output.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.15.attn_output.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.15.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.15.attn_output.weight:qs", sym_visibility = "private", type = tensor<4096x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.15.ffn_norm.weight"> : tensor<4096xbf16>, sym_name = "__auto.blk.15.ffn_norm.weight", sym_visibility = "private", type = tensor<4096xbf16>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.15.ffn_gate.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.15.ffn_gate.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.15.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.15.ffn_gate.weight:qs", sym_visibility = "private", type = tensor<14336x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.15.ffn_up.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.15.ffn_up.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.15.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.15.ffn_up.weight:qs", sym_visibility = "private", type = tensor<14336x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.15.ffn_down.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.15.ffn_down.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.15.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>, sym_name = "__auto.blk.15.ffn_down.weight:qs", sym_visibility = "private", type = tensor<4096x14336xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.16.attn_norm.weight"> : tensor<4096xbf16>, sym_name = "__auto.blk.16.attn_norm.weight", sym_visibility = "private", type = tensor<4096xbf16>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.16.attn_q.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.16.attn_q.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.16.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.16.attn_q.weight:qs", sym_visibility = "private", type = tensor<4096x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.16.attn_q.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.16.attn_q.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.16.attn_k.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.16.attn_k.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.16.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.16.attn_k.weight:qs", sym_visibility = "private", type = tensor<1024x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.16.attn_k.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.16.attn_k.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.16.attn_v.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.16.attn_v.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.16.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.16.attn_v.weight:qs", sym_visibility = "private", type = tensor<1024x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.16.attn_v.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.16.attn_v.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.16.attn_scale"> : tensor<f32>, sym_name = "__auto.blk.16.attn_scale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.16.attn_output.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.16.attn_output.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.16.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.16.attn_output.weight:qs", sym_visibility = "private", type = tensor<4096x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.16.ffn_norm.weight"> : tensor<4096xbf16>, sym_name = "__auto.blk.16.ffn_norm.weight", sym_visibility = "private", type = tensor<4096xbf16>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.16.ffn_gate.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.16.ffn_gate.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.16.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.16.ffn_gate.weight:qs", sym_visibility = "private", type = tensor<14336x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.16.ffn_up.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.16.ffn_up.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.16.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.16.ffn_up.weight:qs", sym_visibility = "private", type = tensor<14336x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.16.ffn_down.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.16.ffn_down.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.16.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>, sym_name = "__auto.blk.16.ffn_down.weight:qs", sym_visibility = "private", type = tensor<4096x14336xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.17.attn_norm.weight"> : tensor<4096xbf16>, sym_name = "__auto.blk.17.attn_norm.weight", sym_visibility = "private", type = tensor<4096xbf16>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.17.attn_q.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.17.attn_q.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.17.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.17.attn_q.weight:qs", sym_visibility = "private", type = tensor<4096x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.17.attn_q.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.17.attn_q.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.17.attn_k.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.17.attn_k.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.17.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.17.attn_k.weight:qs", sym_visibility = "private", type = tensor<1024x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.17.attn_k.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.17.attn_k.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.17.attn_v.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.17.attn_v.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.17.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.17.attn_v.weight:qs", sym_visibility = "private", type = tensor<1024x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.17.attn_v.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.17.attn_v.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.17.attn_scale"> : tensor<f32>, sym_name = "__auto.blk.17.attn_scale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.17.attn_output.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.17.attn_output.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.17.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.17.attn_output.weight:qs", sym_visibility = "private", type = tensor<4096x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.17.ffn_norm.weight"> : tensor<4096xbf16>, sym_name = "__auto.blk.17.ffn_norm.weight", sym_visibility = "private", type = tensor<4096xbf16>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.17.ffn_gate.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.17.ffn_gate.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.17.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.17.ffn_gate.weight:qs", sym_visibility = "private", type = tensor<14336x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.17.ffn_up.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.17.ffn_up.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.17.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.17.ffn_up.weight:qs", sym_visibility = "private", type = tensor<14336x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.17.ffn_down.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.17.ffn_down.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.17.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>, sym_name = "__auto.blk.17.ffn_down.weight:qs", sym_visibility = "private", type = tensor<4096x14336xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.18.attn_norm.weight"> : tensor<4096xbf16>, sym_name = "__auto.blk.18.attn_norm.weight", sym_visibility = "private", type = tensor<4096xbf16>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.18.attn_q.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.18.attn_q.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.18.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.18.attn_q.weight:qs", sym_visibility = "private", type = tensor<4096x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.18.attn_q.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.18.attn_q.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.18.attn_k.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.18.attn_k.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.18.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.18.attn_k.weight:qs", sym_visibility = "private", type = tensor<1024x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.18.attn_k.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.18.attn_k.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.18.attn_v.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.18.attn_v.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.18.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.18.attn_v.weight:qs", sym_visibility = "private", type = tensor<1024x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.18.attn_v.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.18.attn_v.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.18.attn_scale"> : tensor<f32>, sym_name = "__auto.blk.18.attn_scale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.18.attn_output.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.18.attn_output.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.18.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.18.attn_output.weight:qs", sym_visibility = "private", type = tensor<4096x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.18.ffn_norm.weight"> : tensor<4096xbf16>, sym_name = "__auto.blk.18.ffn_norm.weight", sym_visibility = "private", type = tensor<4096xbf16>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.18.ffn_gate.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.18.ffn_gate.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.18.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.18.ffn_gate.weight:qs", sym_visibility = "private", type = tensor<14336x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.18.ffn_up.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.18.ffn_up.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.18.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.18.ffn_up.weight:qs", sym_visibility = "private", type = tensor<14336x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.18.ffn_down.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.18.ffn_down.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.18.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>, sym_name = "__auto.blk.18.ffn_down.weight:qs", sym_visibility = "private", type = tensor<4096x14336xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.19.attn_norm.weight"> : tensor<4096xbf16>, sym_name = "__auto.blk.19.attn_norm.weight", sym_visibility = "private", type = tensor<4096xbf16>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.19.attn_q.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.19.attn_q.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.19.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.19.attn_q.weight:qs", sym_visibility = "private", type = tensor<4096x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.19.attn_q.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.19.attn_q.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.19.attn_k.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.19.attn_k.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.19.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.19.attn_k.weight:qs", sym_visibility = "private", type = tensor<1024x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.19.attn_k.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.19.attn_k.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.19.attn_v.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.19.attn_v.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.19.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.19.attn_v.weight:qs", sym_visibility = "private", type = tensor<1024x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.19.attn_v.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.19.attn_v.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.19.attn_scale"> : tensor<f32>, sym_name = "__auto.blk.19.attn_scale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.19.attn_output.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.19.attn_output.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.19.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.19.attn_output.weight:qs", sym_visibility = "private", type = tensor<4096x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.19.ffn_norm.weight"> : tensor<4096xbf16>, sym_name = "__auto.blk.19.ffn_norm.weight", sym_visibility = "private", type = tensor<4096xbf16>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.19.ffn_gate.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.19.ffn_gate.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.19.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.19.ffn_gate.weight:qs", sym_visibility = "private", type = tensor<14336x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.19.ffn_up.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.19.ffn_up.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.19.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.19.ffn_up.weight:qs", sym_visibility = "private", type = tensor<14336x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.19.ffn_down.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.19.ffn_down.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.19.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>, sym_name = "__auto.blk.19.ffn_down.weight:qs", sym_visibility = "private", type = tensor<4096x14336xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.20.attn_norm.weight"> : tensor<4096xbf16>, sym_name = "__auto.blk.20.attn_norm.weight", sym_visibility = "private", type = tensor<4096xbf16>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.20.attn_q.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.20.attn_q.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.20.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.20.attn_q.weight:qs", sym_visibility = "private", type = tensor<4096x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.20.attn_q.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.20.attn_q.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.20.attn_k.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.20.attn_k.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.20.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.20.attn_k.weight:qs", sym_visibility = "private", type = tensor<1024x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.20.attn_k.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.20.attn_k.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.20.attn_v.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.20.attn_v.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.20.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.20.attn_v.weight:qs", sym_visibility = "private", type = tensor<1024x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.20.attn_v.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.20.attn_v.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.20.attn_scale"> : tensor<f32>, sym_name = "__auto.blk.20.attn_scale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.20.attn_output.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.20.attn_output.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.20.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.20.attn_output.weight:qs", sym_visibility = "private", type = tensor<4096x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.20.ffn_norm.weight"> : tensor<4096xbf16>, sym_name = "__auto.blk.20.ffn_norm.weight", sym_visibility = "private", type = tensor<4096xbf16>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.20.ffn_gate.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.20.ffn_gate.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.20.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.20.ffn_gate.weight:qs", sym_visibility = "private", type = tensor<14336x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.20.ffn_up.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.20.ffn_up.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.20.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.20.ffn_up.weight:qs", sym_visibility = "private", type = tensor<14336x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.20.ffn_down.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.20.ffn_down.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.20.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>, sym_name = "__auto.blk.20.ffn_down.weight:qs", sym_visibility = "private", type = tensor<4096x14336xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.21.attn_norm.weight"> : tensor<4096xbf16>, sym_name = "__auto.blk.21.attn_norm.weight", sym_visibility = "private", type = tensor<4096xbf16>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.21.attn_q.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.21.attn_q.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.21.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.21.attn_q.weight:qs", sym_visibility = "private", type = tensor<4096x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.21.attn_q.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.21.attn_q.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.21.attn_k.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.21.attn_k.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.21.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.21.attn_k.weight:qs", sym_visibility = "private", type = tensor<1024x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.21.attn_k.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.21.attn_k.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.21.attn_v.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.21.attn_v.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.21.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.21.attn_v.weight:qs", sym_visibility = "private", type = tensor<1024x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.21.attn_v.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.21.attn_v.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.21.attn_scale"> : tensor<f32>, sym_name = "__auto.blk.21.attn_scale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.21.attn_output.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.21.attn_output.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.21.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.21.attn_output.weight:qs", sym_visibility = "private", type = tensor<4096x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.21.ffn_norm.weight"> : tensor<4096xbf16>, sym_name = "__auto.blk.21.ffn_norm.weight", sym_visibility = "private", type = tensor<4096xbf16>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.21.ffn_gate.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.21.ffn_gate.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.21.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.21.ffn_gate.weight:qs", sym_visibility = "private", type = tensor<14336x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.21.ffn_up.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.21.ffn_up.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.21.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.21.ffn_up.weight:qs", sym_visibility = "private", type = tensor<14336x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.21.ffn_down.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.21.ffn_down.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.21.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>, sym_name = "__auto.blk.21.ffn_down.weight:qs", sym_visibility = "private", type = tensor<4096x14336xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.22.attn_norm.weight"> : tensor<4096xbf16>, sym_name = "__auto.blk.22.attn_norm.weight", sym_visibility = "private", type = tensor<4096xbf16>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.22.attn_q.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.22.attn_q.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.22.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.22.attn_q.weight:qs", sym_visibility = "private", type = tensor<4096x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.22.attn_q.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.22.attn_q.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.22.attn_k.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.22.attn_k.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.22.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.22.attn_k.weight:qs", sym_visibility = "private", type = tensor<1024x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.22.attn_k.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.22.attn_k.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.22.attn_v.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.22.attn_v.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.22.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.22.attn_v.weight:qs", sym_visibility = "private", type = tensor<1024x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.22.attn_v.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.22.attn_v.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.22.attn_scale"> : tensor<f32>, sym_name = "__auto.blk.22.attn_scale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.22.attn_output.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.22.attn_output.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.22.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.22.attn_output.weight:qs", sym_visibility = "private", type = tensor<4096x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.22.ffn_norm.weight"> : tensor<4096xbf16>, sym_name = "__auto.blk.22.ffn_norm.weight", sym_visibility = "private", type = tensor<4096xbf16>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.22.ffn_gate.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.22.ffn_gate.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.22.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.22.ffn_gate.weight:qs", sym_visibility = "private", type = tensor<14336x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.22.ffn_up.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.22.ffn_up.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.22.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.22.ffn_up.weight:qs", sym_visibility = "private", type = tensor<14336x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.22.ffn_down.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.22.ffn_down.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.22.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>, sym_name = "__auto.blk.22.ffn_down.weight:qs", sym_visibility = "private", type = tensor<4096x14336xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.23.attn_norm.weight"> : tensor<4096xbf16>, sym_name = "__auto.blk.23.attn_norm.weight", sym_visibility = "private", type = tensor<4096xbf16>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.23.attn_q.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.23.attn_q.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.23.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.23.attn_q.weight:qs", sym_visibility = "private", type = tensor<4096x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.23.attn_q.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.23.attn_q.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.23.attn_k.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.23.attn_k.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.23.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.23.attn_k.weight:qs", sym_visibility = "private", type = tensor<1024x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.23.attn_k.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.23.attn_k.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.23.attn_v.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.23.attn_v.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.23.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.23.attn_v.weight:qs", sym_visibility = "private", type = tensor<1024x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.23.attn_v.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.23.attn_v.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.23.attn_scale"> : tensor<f32>, sym_name = "__auto.blk.23.attn_scale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.23.attn_output.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.23.attn_output.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.23.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.23.attn_output.weight:qs", sym_visibility = "private", type = tensor<4096x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.23.ffn_norm.weight"> : tensor<4096xbf16>, sym_name = "__auto.blk.23.ffn_norm.weight", sym_visibility = "private", type = tensor<4096xbf16>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.23.ffn_gate.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.23.ffn_gate.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.23.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.23.ffn_gate.weight:qs", sym_visibility = "private", type = tensor<14336x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.23.ffn_up.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.23.ffn_up.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.23.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.23.ffn_up.weight:qs", sym_visibility = "private", type = tensor<14336x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.23.ffn_down.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.23.ffn_down.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.23.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>, sym_name = "__auto.blk.23.ffn_down.weight:qs", sym_visibility = "private", type = tensor<4096x14336xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.24.attn_norm.weight"> : tensor<4096xbf16>, sym_name = "__auto.blk.24.attn_norm.weight", sym_visibility = "private", type = tensor<4096xbf16>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.24.attn_q.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.24.attn_q.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.24.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.24.attn_q.weight:qs", sym_visibility = "private", type = tensor<4096x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.24.attn_q.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.24.attn_q.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.24.attn_k.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.24.attn_k.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.24.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.24.attn_k.weight:qs", sym_visibility = "private", type = tensor<1024x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.24.attn_k.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.24.attn_k.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.24.attn_v.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.24.attn_v.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.24.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.24.attn_v.weight:qs", sym_visibility = "private", type = tensor<1024x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.24.attn_v.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.24.attn_v.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.24.attn_scale"> : tensor<f32>, sym_name = "__auto.blk.24.attn_scale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.24.attn_output.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.24.attn_output.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.24.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.24.attn_output.weight:qs", sym_visibility = "private", type = tensor<4096x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.24.ffn_norm.weight"> : tensor<4096xbf16>, sym_name = "__auto.blk.24.ffn_norm.weight", sym_visibility = "private", type = tensor<4096xbf16>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.24.ffn_gate.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.24.ffn_gate.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.24.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.24.ffn_gate.weight:qs", sym_visibility = "private", type = tensor<14336x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.24.ffn_up.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.24.ffn_up.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.24.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.24.ffn_up.weight:qs", sym_visibility = "private", type = tensor<14336x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.24.ffn_down.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.24.ffn_down.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.24.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>, sym_name = "__auto.blk.24.ffn_down.weight:qs", sym_visibility = "private", type = tensor<4096x14336xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.25.attn_norm.weight"> : tensor<4096xbf16>, sym_name = "__auto.blk.25.attn_norm.weight", sym_visibility = "private", type = tensor<4096xbf16>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.25.attn_q.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.25.attn_q.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.25.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.25.attn_q.weight:qs", sym_visibility = "private", type = tensor<4096x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.25.attn_q.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.25.attn_q.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.25.attn_k.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.25.attn_k.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.25.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.25.attn_k.weight:qs", sym_visibility = "private", type = tensor<1024x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.25.attn_k.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.25.attn_k.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.25.attn_v.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.25.attn_v.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.25.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.25.attn_v.weight:qs", sym_visibility = "private", type = tensor<1024x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.25.attn_v.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.25.attn_v.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.25.attn_scale"> : tensor<f32>, sym_name = "__auto.blk.25.attn_scale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.25.attn_output.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.25.attn_output.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.25.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.25.attn_output.weight:qs", sym_visibility = "private", type = tensor<4096x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.25.ffn_norm.weight"> : tensor<4096xbf16>, sym_name = "__auto.blk.25.ffn_norm.weight", sym_visibility = "private", type = tensor<4096xbf16>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.25.ffn_gate.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.25.ffn_gate.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.25.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.25.ffn_gate.weight:qs", sym_visibility = "private", type = tensor<14336x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.25.ffn_up.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.25.ffn_up.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.25.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.25.ffn_up.weight:qs", sym_visibility = "private", type = tensor<14336x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.25.ffn_down.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.25.ffn_down.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.25.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>, sym_name = "__auto.blk.25.ffn_down.weight:qs", sym_visibility = "private", type = tensor<4096x14336xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.26.attn_norm.weight"> : tensor<4096xbf16>, sym_name = "__auto.blk.26.attn_norm.weight", sym_visibility = "private", type = tensor<4096xbf16>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.26.attn_q.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.26.attn_q.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.26.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.26.attn_q.weight:qs", sym_visibility = "private", type = tensor<4096x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.26.attn_q.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.26.attn_q.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.26.attn_k.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.26.attn_k.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.26.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.26.attn_k.weight:qs", sym_visibility = "private", type = tensor<1024x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.26.attn_k.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.26.attn_k.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.26.attn_v.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.26.attn_v.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.26.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.26.attn_v.weight:qs", sym_visibility = "private", type = tensor<1024x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.26.attn_v.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.26.attn_v.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.26.attn_scale"> : tensor<f32>, sym_name = "__auto.blk.26.attn_scale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.26.attn_output.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.26.attn_output.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.26.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.26.attn_output.weight:qs", sym_visibility = "private", type = tensor<4096x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.26.ffn_norm.weight"> : tensor<4096xbf16>, sym_name = "__auto.blk.26.ffn_norm.weight", sym_visibility = "private", type = tensor<4096xbf16>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.26.ffn_gate.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.26.ffn_gate.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.26.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.26.ffn_gate.weight:qs", sym_visibility = "private", type = tensor<14336x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.26.ffn_up.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.26.ffn_up.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.26.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.26.ffn_up.weight:qs", sym_visibility = "private", type = tensor<14336x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.26.ffn_down.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.26.ffn_down.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.26.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>, sym_name = "__auto.blk.26.ffn_down.weight:qs", sym_visibility = "private", type = tensor<4096x14336xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.27.attn_norm.weight"> : tensor<4096xbf16>, sym_name = "__auto.blk.27.attn_norm.weight", sym_visibility = "private", type = tensor<4096xbf16>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.27.attn_q.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.27.attn_q.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.27.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.27.attn_q.weight:qs", sym_visibility = "private", type = tensor<4096x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.27.attn_q.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.27.attn_q.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.27.attn_k.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.27.attn_k.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.27.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.27.attn_k.weight:qs", sym_visibility = "private", type = tensor<1024x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.27.attn_k.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.27.attn_k.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.27.attn_v.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.27.attn_v.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.27.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.27.attn_v.weight:qs", sym_visibility = "private", type = tensor<1024x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.27.attn_v.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.27.attn_v.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.27.attn_scale"> : tensor<f32>, sym_name = "__auto.blk.27.attn_scale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.27.attn_output.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.27.attn_output.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.27.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.27.attn_output.weight:qs", sym_visibility = "private", type = tensor<4096x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.27.ffn_norm.weight"> : tensor<4096xbf16>, sym_name = "__auto.blk.27.ffn_norm.weight", sym_visibility = "private", type = tensor<4096xbf16>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.27.ffn_gate.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.27.ffn_gate.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.27.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.27.ffn_gate.weight:qs", sym_visibility = "private", type = tensor<14336x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.27.ffn_up.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.27.ffn_up.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.27.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.27.ffn_up.weight:qs", sym_visibility = "private", type = tensor<14336x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.27.ffn_down.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.27.ffn_down.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.27.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>, sym_name = "__auto.blk.27.ffn_down.weight:qs", sym_visibility = "private", type = tensor<4096x14336xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.28.attn_norm.weight"> : tensor<4096xbf16>, sym_name = "__auto.blk.28.attn_norm.weight", sym_visibility = "private", type = tensor<4096xbf16>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.28.attn_q.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.28.attn_q.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.28.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.28.attn_q.weight:qs", sym_visibility = "private", type = tensor<4096x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.28.attn_q.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.28.attn_q.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.28.attn_k.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.28.attn_k.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.28.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.28.attn_k.weight:qs", sym_visibility = "private", type = tensor<1024x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.28.attn_k.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.28.attn_k.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.28.attn_v.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.28.attn_v.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.28.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.28.attn_v.weight:qs", sym_visibility = "private", type = tensor<1024x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.28.attn_v.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.28.attn_v.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.28.attn_scale"> : tensor<f32>, sym_name = "__auto.blk.28.attn_scale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.28.attn_output.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.28.attn_output.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.28.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.28.attn_output.weight:qs", sym_visibility = "private", type = tensor<4096x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.28.ffn_norm.weight"> : tensor<4096xbf16>, sym_name = "__auto.blk.28.ffn_norm.weight", sym_visibility = "private", type = tensor<4096xbf16>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.28.ffn_gate.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.28.ffn_gate.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.28.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.28.ffn_gate.weight:qs", sym_visibility = "private", type = tensor<14336x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.28.ffn_up.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.28.ffn_up.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.28.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.28.ffn_up.weight:qs", sym_visibility = "private", type = tensor<14336x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.28.ffn_down.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.28.ffn_down.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.28.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>, sym_name = "__auto.blk.28.ffn_down.weight:qs", sym_visibility = "private", type = tensor<4096x14336xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.29.attn_norm.weight"> : tensor<4096xbf16>, sym_name = "__auto.blk.29.attn_norm.weight", sym_visibility = "private", type = tensor<4096xbf16>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.29.attn_q.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.29.attn_q.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.29.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.29.attn_q.weight:qs", sym_visibility = "private", type = tensor<4096x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.29.attn_q.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.29.attn_q.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.29.attn_k.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.29.attn_k.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.29.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.29.attn_k.weight:qs", sym_visibility = "private", type = tensor<1024x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.29.attn_k.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.29.attn_k.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.29.attn_v.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.29.attn_v.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.29.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.29.attn_v.weight:qs", sym_visibility = "private", type = tensor<1024x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.29.attn_v.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.29.attn_v.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.29.attn_scale"> : tensor<f32>, sym_name = "__auto.blk.29.attn_scale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.29.attn_output.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.29.attn_output.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.29.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.29.attn_output.weight:qs", sym_visibility = "private", type = tensor<4096x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.29.ffn_norm.weight"> : tensor<4096xbf16>, sym_name = "__auto.blk.29.ffn_norm.weight", sym_visibility = "private", type = tensor<4096xbf16>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.29.ffn_gate.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.29.ffn_gate.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.29.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.29.ffn_gate.weight:qs", sym_visibility = "private", type = tensor<14336x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.29.ffn_up.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.29.ffn_up.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.29.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.29.ffn_up.weight:qs", sym_visibility = "private", type = tensor<14336x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.29.ffn_down.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.29.ffn_down.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.29.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>, sym_name = "__auto.blk.29.ffn_down.weight:qs", sym_visibility = "private", type = tensor<4096x14336xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.30.attn_norm.weight"> : tensor<4096xbf16>, sym_name = "__auto.blk.30.attn_norm.weight", sym_visibility = "private", type = tensor<4096xbf16>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.30.attn_q.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.30.attn_q.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.30.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.30.attn_q.weight:qs", sym_visibility = "private", type = tensor<4096x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.30.attn_q.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.30.attn_q.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.30.attn_k.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.30.attn_k.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.30.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.30.attn_k.weight:qs", sym_visibility = "private", type = tensor<1024x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.30.attn_k.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.30.attn_k.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.30.attn_v.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.30.attn_v.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.30.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.30.attn_v.weight:qs", sym_visibility = "private", type = tensor<1024x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.30.attn_v.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.30.attn_v.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.30.attn_scale"> : tensor<f32>, sym_name = "__auto.blk.30.attn_scale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.30.attn_output.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.30.attn_output.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.30.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.30.attn_output.weight:qs", sym_visibility = "private", type = tensor<4096x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.30.ffn_norm.weight"> : tensor<4096xbf16>, sym_name = "__auto.blk.30.ffn_norm.weight", sym_visibility = "private", type = tensor<4096xbf16>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.30.ffn_gate.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.30.ffn_gate.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.30.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.30.ffn_gate.weight:qs", sym_visibility = "private", type = tensor<14336x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.30.ffn_up.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.30.ffn_up.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.30.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.30.ffn_up.weight:qs", sym_visibility = "private", type = tensor<14336x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.30.ffn_down.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.30.ffn_down.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.30.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>, sym_name = "__auto.blk.30.ffn_down.weight:qs", sym_visibility = "private", type = tensor<4096x14336xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.31.attn_norm.weight"> : tensor<4096xbf16>, sym_name = "__auto.blk.31.attn_norm.weight", sym_visibility = "private", type = tensor<4096xbf16>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.31.attn_q.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.31.attn_q.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.31.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.31.attn_q.weight:qs", sym_visibility = "private", type = tensor<4096x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.31.attn_q.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.31.attn_q.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.31.attn_k.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.31.attn_k.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.31.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.31.attn_k.weight:qs", sym_visibility = "private", type = tensor<1024x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.31.attn_k.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.31.attn_k.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.31.attn_v.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.31.attn_v.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.31.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.31.attn_v.weight:qs", sym_visibility = "private", type = tensor<1024x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.31.attn_v.q_output:rscale"> : tensor<f32>, sym_name = "__auto.blk.31.attn_v.q_output:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.31.attn_scale"> : tensor<f32>, sym_name = "__auto.blk.31.attn_scale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.31.attn_output.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.31.attn_output.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.31.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.31.attn_output.weight:qs", sym_visibility = "private", type = tensor<4096x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.31.ffn_norm.weight"> : tensor<4096xbf16>, sym_name = "__auto.blk.31.ffn_norm.weight", sym_visibility = "private", type = tensor<4096xbf16>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.31.ffn_gate.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.31.ffn_gate.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.31.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.31.ffn_gate.weight:qs", sym_visibility = "private", type = tensor<14336x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.31.ffn_up.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.31.ffn_up.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.31.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>, sym_name = "__auto.blk.31.ffn_up.weight:qs", sym_visibility = "private", type = tensor<14336x4096xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.31.ffn_down.q_input:rscale"> : tensor<f32>, sym_name = "__auto.blk.31.ffn_down.q_input:rscale", sym_visibility = "private", type = tensor<f32>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"blk.31.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>, sym_name = "__auto.blk.31.ffn_down.weight:qs", sym_visibility = "private", type = tensor<4096x14336xf8E4M3FNUZ>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"output_norm.weight"> : tensor<4096xbf16>, sym_name = "__auto.output_norm.weight", sym_visibility = "private", type = tensor<4096xbf16>}> : () -> ()
"util.global"() <{initial_value = #stream.parameter.named<"model"::"output.weight"> : tensor<128256x4096xbf16>, sym_name = "__auto.output.weight", sym_visibility = "private", type = tensor<128256x4096xbf16>}> : () -> ()
"func.func"() <{arg_attrs = [{}, {}, {}, {}], function_type = (!torch.vtensor<[4,?],si64>, !torch.vtensor<[4],si64>, !torch.vtensor<[4,?],si64>, !torch.tensor<[?,2097152],f8E4M3FNUZ>) -> !torch.vtensor<[4,?,128256],bf16>, sym_name = "prefill_bs4"}> ({
^bb0(%arg67: !torch.vtensor<[4,?],si64>, %arg68: !torch.vtensor<[4],si64>, %arg69: !torch.vtensor<[4,?],si64>, %arg70: !torch.tensor<[?,2097152],f8E4M3FNUZ>):
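// Preamble: load every parameter global (util.global.load) and wrap the builtin
// tensor as a !torch.vtensor (torch_c.from_builtin_tensor) so the torch-dialect
// ops in the function body can consume it. Scalar rescales become rank-0
// vtensors ([],f32).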
%17186 = "util.global.load"() <{global = @__auto.token_embd.weight}> : () -> tensor<128256x4096xbf16>
%17187 = "torch_c.from_builtin_tensor"(%17186) : (tensor<128256x4096xbf16>) -> !torch.vtensor<[128256,4096],bf16>
%17188 = "util.global.load"() <{global = @__auto.blk.0.attn_norm.weight}> : () -> tensor<4096xbf16>
%17189 = "torch_c.from_builtin_tensor"(%17188) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
%17190 = "util.global.load"() <{global = @"__auto.blk.0.attn_q.q_input:rscale"}> : () -> tensor<f32>
%17191 = "torch_c.from_builtin_tensor"(%17190) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17192 = "util.global.load"() <{global = @"__auto.blk.0.attn_q.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
%17193 = "torch_c.from_builtin_tensor"(%17192) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%17194 = "util.global.load"() <{global = @"__auto.blk.0.attn_q.q_output:rscale"}> : () -> tensor<f32>
%17195 = "torch_c.from_builtin_tensor"(%17194) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17196 = "util.global.load"() <{global = @"__auto.blk.0.attn_k.q_input:rscale"}> : () -> tensor<f32>
%17197 = "torch_c.from_builtin_tensor"(%17196) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17198 = "util.global.load"() <{global = @"__auto.blk.0.attn_k.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
%17199 = "torch_c.from_builtin_tensor"(%17198) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%17200 = "util.global.load"() <{global = @"__auto.blk.0.attn_k.q_output:rscale"}> : () -> tensor<f32>
%17201 = "torch_c.from_builtin_tensor"(%17200) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17202 = "util.global.load"() <{global = @"__auto.blk.0.attn_v.q_input:rscale"}> : () -> tensor<f32>
%17203 = "torch_c.from_builtin_tensor"(%17202) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17204 = "util.global.load"() <{global = @"__auto.blk.0.attn_v.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
%17205 = "torch_c.from_builtin_tensor"(%17204) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%17206 = "util.global.load"() <{global = @"__auto.blk.0.attn_v.q_output:rscale"}> : () -> tensor<f32>
%17207 = "torch_c.from_builtin_tensor"(%17206) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17208 = "util.global.load"() <{global = @__auto.blk.0.attn_scale}> : () -> tensor<f32>
%17209 = "torch_c.from_builtin_tensor"(%17208) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17210 = "util.global.load"() <{global = @"__auto.blk.0.attn_output.q_input:rscale"}> : () -> tensor<f32>
%17211 = "torch_c.from_builtin_tensor"(%17210) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17212 = "util.global.load"() <{global = @"__auto.blk.0.attn_output.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
%17213 = "torch_c.from_builtin_tensor"(%17212) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%17214 = "util.global.load"() <{global = @__auto.blk.0.ffn_norm.weight}> : () -> tensor<4096xbf16>
%17215 = "torch_c.from_builtin_tensor"(%17214) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
%17216 = "util.global.load"() <{global = @"__auto.blk.0.ffn_gate.q_input:rscale"}> : () -> tensor<f32>
%17217 = "torch_c.from_builtin_tensor"(%17216) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17218 = "util.global.load"() <{global = @"__auto.blk.0.ffn_gate.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
%17219 = "torch_c.from_builtin_tensor"(%17218) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%17220 = "util.global.load"() <{global = @"__auto.blk.0.ffn_up.q_input:rscale"}> : () -> tensor<f32>
%17221 = "torch_c.from_builtin_tensor"(%17220) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17222 = "util.global.load"() <{global = @"__auto.blk.0.ffn_up.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
%17223 = "torch_c.from_builtin_tensor"(%17222) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%17224 = "util.global.load"() <{global = @"__auto.blk.0.ffn_down.q_input:rscale"}> : () -> tensor<f32>
%17225 = "torch_c.from_builtin_tensor"(%17224) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17226 = "util.global.load"() <{global = @"__auto.blk.0.ffn_down.weight:qs"}> : () -> tensor<4096x14336xf8E4M3FNUZ>
%17227 = "torch_c.from_builtin_tensor"(%17226) : (tensor<4096x14336xf8E4M3FNUZ>) -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
%17228 = "util.global.load"() <{global = @__auto.blk.1.attn_norm.weight}> : () -> tensor<4096xbf16>
%17229 = "torch_c.from_builtin_tensor"(%17228) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
%17230 = "util.global.load"() <{global = @"__auto.blk.1.attn_q.q_input:rscale"}> : () -> tensor<f32>
%17231 = "torch_c.from_builtin_tensor"(%17230) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17232 = "util.global.load"() <{global = @"__auto.blk.1.attn_q.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
%17233 = "torch_c.from_builtin_tensor"(%17232) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%17234 = "util.global.load"() <{global = @"__auto.blk.1.attn_q.q_output:rscale"}> : () -> tensor<f32>
%17235 = "torch_c.from_builtin_tensor"(%17234) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17236 = "util.global.load"() <{global = @"__auto.blk.1.attn_k.q_input:rscale"}> : () -> tensor<f32>
%17237 = "torch_c.from_builtin_tensor"(%17236) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17238 = "util.global.load"() <{global = @"__auto.blk.1.attn_k.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
%17239 = "torch_c.from_builtin_tensor"(%17238) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%17240 = "util.global.load"() <{global = @"__auto.blk.1.attn_k.q_output:rscale"}> : () -> tensor<f32>
%17241 = "torch_c.from_builtin_tensor"(%17240) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17242 = "util.global.load"() <{global = @"__auto.blk.1.attn_v.q_input:rscale"}> : () -> tensor<f32>
%17243 = "torch_c.from_builtin_tensor"(%17242) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17244 = "util.global.load"() <{global = @"__auto.blk.1.attn_v.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
%17245 = "torch_c.from_builtin_tensor"(%17244) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%17246 = "util.global.load"() <{global = @"__auto.blk.1.attn_v.q_output:rscale"}> : () -> tensor<f32>
%17247 = "torch_c.from_builtin_tensor"(%17246) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17248 = "util.global.load"() <{global = @__auto.blk.1.attn_scale}> : () -> tensor<f32>
%17249 = "torch_c.from_builtin_tensor"(%17248) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17250 = "util.global.load"() <{global = @"__auto.blk.1.attn_output.q_input:rscale"}> : () -> tensor<f32>
%17251 = "torch_c.from_builtin_tensor"(%17250) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17252 = "util.global.load"() <{global = @"__auto.blk.1.attn_output.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
%17253 = "torch_c.from_builtin_tensor"(%17252) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%17254 = "util.global.load"() <{global = @__auto.blk.1.ffn_norm.weight}> : () -> tensor<4096xbf16>
%17255 = "torch_c.from_builtin_tensor"(%17254) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
%17256 = "util.global.load"() <{global = @"__auto.blk.1.ffn_gate.q_input:rscale"}> : () -> tensor<f32>
%17257 = "torch_c.from_builtin_tensor"(%17256) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17258 = "util.global.load"() <{global = @"__auto.blk.1.ffn_gate.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
%17259 = "torch_c.from_builtin_tensor"(%17258) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%17260 = "util.global.load"() <{global = @"__auto.blk.1.ffn_up.q_input:rscale"}> : () -> tensor<f32>
%17261 = "torch_c.from_builtin_tensor"(%17260) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17262 = "util.global.load"() <{global = @"__auto.blk.1.ffn_up.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
%17263 = "torch_c.from_builtin_tensor"(%17262) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%17264 = "util.global.load"() <{global = @"__auto.blk.1.ffn_down.q_input:rscale"}> : () -> tensor<f32>
%17265 = "torch_c.from_builtin_tensor"(%17264) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17266 = "util.global.load"() <{global = @"__auto.blk.1.ffn_down.weight:qs"}> : () -> tensor<4096x14336xf8E4M3FNUZ>
%17267 = "torch_c.from_builtin_tensor"(%17266) : (tensor<4096x14336xf8E4M3FNUZ>) -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
%17268 = "util.global.load"() <{global = @__auto.blk.2.attn_norm.weight}> : () -> tensor<4096xbf16>
%17269 = "torch_c.from_builtin_tensor"(%17268) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
%17270 = "util.global.load"() <{global = @"__auto.blk.2.attn_q.q_input:rscale"}> : () -> tensor<f32>
%17271 = "torch_c.from_builtin_tensor"(%17270) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17272 = "util.global.load"() <{global = @"__auto.blk.2.attn_q.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
%17273 = "torch_c.from_builtin_tensor"(%17272) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%17274 = "util.global.load"() <{global = @"__auto.blk.2.attn_q.q_output:rscale"}> : () -> tensor<f32>
%17275 = "torch_c.from_builtin_tensor"(%17274) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17276 = "util.global.load"() <{global = @"__auto.blk.2.attn_k.q_input:rscale"}> : () -> tensor<f32>
%17277 = "torch_c.from_builtin_tensor"(%17276) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17278 = "util.global.load"() <{global = @"__auto.blk.2.attn_k.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
%17279 = "torch_c.from_builtin_tensor"(%17278) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%17280 = "util.global.load"() <{global = @"__auto.blk.2.attn_k.q_output:rscale"}> : () -> tensor<f32>
%17281 = "torch_c.from_builtin_tensor"(%17280) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17282 = "util.global.load"() <{global = @"__auto.blk.2.attn_v.q_input:rscale"}> : () -> tensor<f32>
%17283 = "torch_c.from_builtin_tensor"(%17282) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17284 = "util.global.load"() <{global = @"__auto.blk.2.attn_v.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
%17285 = "torch_c.from_builtin_tensor"(%17284) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%17286 = "util.global.load"() <{global = @"__auto.blk.2.attn_v.q_output:rscale"}> : () -> tensor<f32>
%17287 = "torch_c.from_builtin_tensor"(%17286) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17288 = "util.global.load"() <{global = @__auto.blk.2.attn_scale}> : () -> tensor<f32>
%17289 = "torch_c.from_builtin_tensor"(%17288) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17290 = "util.global.load"() <{global = @"__auto.blk.2.attn_output.q_input:rscale"}> : () -> tensor<f32>
%17291 = "torch_c.from_builtin_tensor"(%17290) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17292 = "util.global.load"() <{global = @"__auto.blk.2.attn_output.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
%17293 = "torch_c.from_builtin_tensor"(%17292) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%17294 = "util.global.load"() <{global = @__auto.blk.2.ffn_norm.weight}> : () -> tensor<4096xbf16>
%17295 = "torch_c.from_builtin_tensor"(%17294) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
%17296 = "util.global.load"() <{global = @"__auto.blk.2.ffn_gate.q_input:rscale"}> : () -> tensor<f32>
%17297 = "torch_c.from_builtin_tensor"(%17296) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17298 = "util.global.load"() <{global = @"__auto.blk.2.ffn_gate.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
%17299 = "torch_c.from_builtin_tensor"(%17298) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%17300 = "util.global.load"() <{global = @"__auto.blk.2.ffn_up.q_input:rscale"}> : () -> tensor<f32>
%17301 = "torch_c.from_builtin_tensor"(%17300) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17302 = "util.global.load"() <{global = @"__auto.blk.2.ffn_up.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
%17303 = "torch_c.from_builtin_tensor"(%17302) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%17304 = "util.global.load"() <{global = @"__auto.blk.2.ffn_down.q_input:rscale"}> : () -> tensor<f32>
%17305 = "torch_c.from_builtin_tensor"(%17304) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17306 = "util.global.load"() <{global = @"__auto.blk.2.ffn_down.weight:qs"}> : () -> tensor<4096x14336xf8E4M3FNUZ>
%17307 = "torch_c.from_builtin_tensor"(%17306) : (tensor<4096x14336xf8E4M3FNUZ>) -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
%17308 = "util.global.load"() <{global = @__auto.blk.3.attn_norm.weight}> : () -> tensor<4096xbf16>
%17309 = "torch_c.from_builtin_tensor"(%17308) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
%17310 = "util.global.load"() <{global = @"__auto.blk.3.attn_q.q_input:rscale"}> : () -> tensor<f32>
%17311 = "torch_c.from_builtin_tensor"(%17310) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17312 = "util.global.load"() <{global = @"__auto.blk.3.attn_q.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
%17313 = "torch_c.from_builtin_tensor"(%17312) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%17314 = "util.global.load"() <{global = @"__auto.blk.3.attn_q.q_output:rscale"}> : () -> tensor<f32>
%17315 = "torch_c.from_builtin_tensor"(%17314) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17316 = "util.global.load"() <{global = @"__auto.blk.3.attn_k.q_input:rscale"}> : () -> tensor<f32>
%17317 = "torch_c.from_builtin_tensor"(%17316) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17318 = "util.global.load"() <{global = @"__auto.blk.3.attn_k.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
%17319 = "torch_c.from_builtin_tensor"(%17318) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%17320 = "util.global.load"() <{global = @"__auto.blk.3.attn_k.q_output:rscale"}> : () -> tensor<f32>
%17321 = "torch_c.from_builtin_tensor"(%17320) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17322 = "util.global.load"() <{global = @"__auto.blk.3.attn_v.q_input:rscale"}> : () -> tensor<f32>
%17323 = "torch_c.from_builtin_tensor"(%17322) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17324 = "util.global.load"() <{global = @"__auto.blk.3.attn_v.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
%17325 = "torch_c.from_builtin_tensor"(%17324) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%17326 = "util.global.load"() <{global = @"__auto.blk.3.attn_v.q_output:rscale"}> : () -> tensor<f32>
%17327 = "torch_c.from_builtin_tensor"(%17326) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17328 = "util.global.load"() <{global = @__auto.blk.3.attn_scale}> : () -> tensor<f32>
%17329 = "torch_c.from_builtin_tensor"(%17328) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17330 = "util.global.load"() <{global = @"__auto.blk.3.attn_output.q_input:rscale"}> : () -> tensor<f32>
%17331 = "torch_c.from_builtin_tensor"(%17330) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17332 = "util.global.load"() <{global = @"__auto.blk.3.attn_output.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
%17333 = "torch_c.from_builtin_tensor"(%17332) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%17334 = "util.global.load"() <{global = @__auto.blk.3.ffn_norm.weight}> : () -> tensor<4096xbf16>
%17335 = "torch_c.from_builtin_tensor"(%17334) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
%17336 = "util.global.load"() <{global = @"__auto.blk.3.ffn_gate.q_input:rscale"}> : () -> tensor<f32>
%17337 = "torch_c.from_builtin_tensor"(%17336) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17338 = "util.global.load"() <{global = @"__auto.blk.3.ffn_gate.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
%17339 = "torch_c.from_builtin_tensor"(%17338) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%17340 = "util.global.load"() <{global = @"__auto.blk.3.ffn_up.q_input:rscale"}> : () -> tensor<f32>
%17341 = "torch_c.from_builtin_tensor"(%17340) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17342 = "util.global.load"() <{global = @"__auto.blk.3.ffn_up.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
%17343 = "torch_c.from_builtin_tensor"(%17342) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%17344 = "util.global.load"() <{global = @"__auto.blk.3.ffn_down.q_input:rscale"}> : () -> tensor<f32>
%17345 = "torch_c.from_builtin_tensor"(%17344) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17346 = "util.global.load"() <{global = @"__auto.blk.3.ffn_down.weight:qs"}> : () -> tensor<4096x14336xf8E4M3FNUZ>
%17347 = "torch_c.from_builtin_tensor"(%17346) : (tensor<4096x14336xf8E4M3FNUZ>) -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
%17348 = "util.global.load"() <{global = @__auto.blk.4.attn_norm.weight}> : () -> tensor<4096xbf16>
%17349 = "torch_c.from_builtin_tensor"(%17348) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
%17350 = "util.global.load"() <{global = @"__auto.blk.4.attn_q.q_input:rscale"}> : () -> tensor<f32>
%17351 = "torch_c.from_builtin_tensor"(%17350) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17352 = "util.global.load"() <{global = @"__auto.blk.4.attn_q.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
%17353 = "torch_c.from_builtin_tensor"(%17352) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%17354 = "util.global.load"() <{global = @"__auto.blk.4.attn_q.q_output:rscale"}> : () -> tensor<f32>
%17355 = "torch_c.from_builtin_tensor"(%17354) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17356 = "util.global.load"() <{global = @"__auto.blk.4.attn_k.q_input:rscale"}> : () -> tensor<f32>
%17357 = "torch_c.from_builtin_tensor"(%17356) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17358 = "util.global.load"() <{global = @"__auto.blk.4.attn_k.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
%17359 = "torch_c.from_builtin_tensor"(%17358) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%17360 = "util.global.load"() <{global = @"__auto.blk.4.attn_k.q_output:rscale"}> : () -> tensor<f32>
%17361 = "torch_c.from_builtin_tensor"(%17360) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17362 = "util.global.load"() <{global = @"__auto.blk.4.attn_v.q_input:rscale"}> : () -> tensor<f32>
%17363 = "torch_c.from_builtin_tensor"(%17362) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17364 = "util.global.load"() <{global = @"__auto.blk.4.attn_v.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
%17365 = "torch_c.from_builtin_tensor"(%17364) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%17366 = "util.global.load"() <{global = @"__auto.blk.4.attn_v.q_output:rscale"}> : () -> tensor<f32>
%17367 = "torch_c.from_builtin_tensor"(%17366) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17368 = "util.global.load"() <{global = @__auto.blk.4.attn_scale}> : () -> tensor<f32>
%17369 = "torch_c.from_builtin_tensor"(%17368) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17370 = "util.global.load"() <{global = @"__auto.blk.4.attn_output.q_input:rscale"}> : () -> tensor<f32>
%17371 = "torch_c.from_builtin_tensor"(%17370) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17372 = "util.global.load"() <{global = @"__auto.blk.4.attn_output.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
%17373 = "torch_c.from_builtin_tensor"(%17372) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%17374 = "util.global.load"() <{global = @__auto.blk.4.ffn_norm.weight}> : () -> tensor<4096xbf16>
%17375 = "torch_c.from_builtin_tensor"(%17374) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
%17376 = "util.global.load"() <{global = @"__auto.blk.4.ffn_gate.q_input:rscale"}> : () -> tensor<f32>
%17377 = "torch_c.from_builtin_tensor"(%17376) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17378 = "util.global.load"() <{global = @"__auto.blk.4.ffn_gate.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
%17379 = "torch_c.from_builtin_tensor"(%17378) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%17380 = "util.global.load"() <{global = @"__auto.blk.4.ffn_up.q_input:rscale"}> : () -> tensor<f32>
%17381 = "torch_c.from_builtin_tensor"(%17380) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17382 = "util.global.load"() <{global = @"__auto.blk.4.ffn_up.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
%17383 = "torch_c.from_builtin_tensor"(%17382) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%17384 = "util.global.load"() <{global = @"__auto.blk.4.ffn_down.q_input:rscale"}> : () -> tensor<f32>
%17385 = "torch_c.from_builtin_tensor"(%17384) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17386 = "util.global.load"() <{global = @"__auto.blk.4.ffn_down.weight:qs"}> : () -> tensor<4096x14336xf8E4M3FNUZ>
%17387 = "torch_c.from_builtin_tensor"(%17386) : (tensor<4096x14336xf8E4M3FNUZ>) -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
%17388 = "util.global.load"() <{global = @__auto.blk.5.attn_norm.weight}> : () -> tensor<4096xbf16>
%17389 = "torch_c.from_builtin_tensor"(%17388) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
%17390 = "util.global.load"() <{global = @"__auto.blk.5.attn_q.q_input:rscale"}> : () -> tensor<f32>
%17391 = "torch_c.from_builtin_tensor"(%17390) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17392 = "util.global.load"() <{global = @"__auto.blk.5.attn_q.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
%17393 = "torch_c.from_builtin_tensor"(%17392) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%17394 = "util.global.load"() <{global = @"__auto.blk.5.attn_q.q_output:rscale"}> : () -> tensor<f32>
%17395 = "torch_c.from_builtin_tensor"(%17394) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17396 = "util.global.load"() <{global = @"__auto.blk.5.attn_k.q_input:rscale"}> : () -> tensor<f32>
%17397 = "torch_c.from_builtin_tensor"(%17396) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17398 = "util.global.load"() <{global = @"__auto.blk.5.attn_k.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
%17399 = "torch_c.from_builtin_tensor"(%17398) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%17400 = "util.global.load"() <{global = @"__auto.blk.5.attn_k.q_output:rscale"}> : () -> tensor<f32>
%17401 = "torch_c.from_builtin_tensor"(%17400) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17402 = "util.global.load"() <{global = @"__auto.blk.5.attn_v.q_input:rscale"}> : () -> tensor<f32>
%17403 = "torch_c.from_builtin_tensor"(%17402) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17404 = "util.global.load"() <{global = @"__auto.blk.5.attn_v.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
%17405 = "torch_c.from_builtin_tensor"(%17404) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%17406 = "util.global.load"() <{global = @"__auto.blk.5.attn_v.q_output:rscale"}> : () -> tensor<f32>
%17407 = "torch_c.from_builtin_tensor"(%17406) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17408 = "util.global.load"() <{global = @__auto.blk.5.attn_scale}> : () -> tensor<f32>
%17409 = "torch_c.from_builtin_tensor"(%17408) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17410 = "util.global.load"() <{global = @"__auto.blk.5.attn_output.q_input:rscale"}> : () -> tensor<f32>
%17411 = "torch_c.from_builtin_tensor"(%17410) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17412 = "util.global.load"() <{global = @"__auto.blk.5.attn_output.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
%17413 = "torch_c.from_builtin_tensor"(%17412) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%17414 = "util.global.load"() <{global = @__auto.blk.5.ffn_norm.weight}> : () -> tensor<4096xbf16>
%17415 = "torch_c.from_builtin_tensor"(%17414) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
%17416 = "util.global.load"() <{global = @"__auto.blk.5.ffn_gate.q_input:rscale"}> : () -> tensor<f32>
%17417 = "torch_c.from_builtin_tensor"(%17416) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17418 = "util.global.load"() <{global = @"__auto.blk.5.ffn_gate.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
%17419 = "torch_c.from_builtin_tensor"(%17418) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%17420 = "util.global.load"() <{global = @"__auto.blk.5.ffn_up.q_input:rscale"}> : () -> tensor<f32>
%17421 = "torch_c.from_builtin_tensor"(%17420) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17422 = "util.global.load"() <{global = @"__auto.blk.5.ffn_up.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
%17423 = "torch_c.from_builtin_tensor"(%17422) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%17424 = "util.global.load"() <{global = @"__auto.blk.5.ffn_down.q_input:rscale"}> : () -> tensor<f32>
%17425 = "torch_c.from_builtin_tensor"(%17424) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17426 = "util.global.load"() <{global = @"__auto.blk.5.ffn_down.weight:qs"}> : () -> tensor<4096x14336xf8E4M3FNUZ>
%17427 = "torch_c.from_builtin_tensor"(%17426) : (tensor<4096x14336xf8E4M3FNUZ>) -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
%17428 = "util.global.load"() <{global = @__auto.blk.6.attn_norm.weight}> : () -> tensor<4096xbf16>
%17429 = "torch_c.from_builtin_tensor"(%17428) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
%17430 = "util.global.load"() <{global = @"__auto.blk.6.attn_q.q_input:rscale"}> : () -> tensor<f32>
%17431 = "torch_c.from_builtin_tensor"(%17430) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17432 = "util.global.load"() <{global = @"__auto.blk.6.attn_q.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
%17433 = "torch_c.from_builtin_tensor"(%17432) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%17434 = "util.global.load"() <{global = @"__auto.blk.6.attn_q.q_output:rscale"}> : () -> tensor<f32>
%17435 = "torch_c.from_builtin_tensor"(%17434) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17436 = "util.global.load"() <{global = @"__auto.blk.6.attn_k.q_input:rscale"}> : () -> tensor<f32>
%17437 = "torch_c.from_builtin_tensor"(%17436) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17438 = "util.global.load"() <{global = @"__auto.blk.6.attn_k.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
%17439 = "torch_c.from_builtin_tensor"(%17438) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%17440 = "util.global.load"() <{global = @"__auto.blk.6.attn_k.q_output:rscale"}> : () -> tensor<f32>
%17441 = "torch_c.from_builtin_tensor"(%17440) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17442 = "util.global.load"() <{global = @"__auto.blk.6.attn_v.q_input:rscale"}> : () -> tensor<f32>
%17443 = "torch_c.from_builtin_tensor"(%17442) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17444 = "util.global.load"() <{global = @"__auto.blk.6.attn_v.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
%17445 = "torch_c.from_builtin_tensor"(%17444) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%17446 = "util.global.load"() <{global = @"__auto.blk.6.attn_v.q_output:rscale"}> : () -> tensor<f32>
%17447 = "torch_c.from_builtin_tensor"(%17446) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17448 = "util.global.load"() <{global = @__auto.blk.6.attn_scale}> : () -> tensor<f32>
%17449 = "torch_c.from_builtin_tensor"(%17448) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17450 = "util.global.load"() <{global = @"__auto.blk.6.attn_output.q_input:rscale"}> : () -> tensor<f32>
%17451 = "torch_c.from_builtin_tensor"(%17450) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17452 = "util.global.load"() <{global = @"__auto.blk.6.attn_output.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
%17453 = "torch_c.from_builtin_tensor"(%17452) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%17454 = "util.global.load"() <{global = @__auto.blk.6.ffn_norm.weight}> : () -> tensor<4096xbf16>
%17455 = "torch_c.from_builtin_tensor"(%17454) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
%17456 = "util.global.load"() <{global = @"__auto.blk.6.ffn_gate.q_input:rscale"}> : () -> tensor<f32>
%17457 = "torch_c.from_builtin_tensor"(%17456) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17458 = "util.global.load"() <{global = @"__auto.blk.6.ffn_gate.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
%17459 = "torch_c.from_builtin_tensor"(%17458) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%17460 = "util.global.load"() <{global = @"__auto.blk.6.ffn_up.q_input:rscale"}> : () -> tensor<f32>
%17461 = "torch_c.from_builtin_tensor"(%17460) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17462 = "util.global.load"() <{global = @"__auto.blk.6.ffn_up.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
%17463 = "torch_c.from_builtin_tensor"(%17462) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%17464 = "util.global.load"() <{global = @"__auto.blk.6.ffn_down.q_input:rscale"}> : () -> tensor<f32>
%17465 = "torch_c.from_builtin_tensor"(%17464) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17466 = "util.global.load"() <{global = @"__auto.blk.6.ffn_down.weight:qs"}> : () -> tensor<4096x14336xf8E4M3FNUZ>
%17467 = "torch_c.from_builtin_tensor"(%17466) : (tensor<4096x14336xf8E4M3FNUZ>) -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
%17468 = "util.global.load"() <{global = @__auto.blk.7.attn_norm.weight}> : () -> tensor<4096xbf16>
%17469 = "torch_c.from_builtin_tensor"(%17468) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
%17470 = "util.global.load"() <{global = @"__auto.blk.7.attn_q.q_input:rscale"}> : () -> tensor<f32>
%17471 = "torch_c.from_builtin_tensor"(%17470) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17472 = "util.global.load"() <{global = @"__auto.blk.7.attn_q.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
%17473 = "torch_c.from_builtin_tensor"(%17472) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%17474 = "util.global.load"() <{global = @"__auto.blk.7.attn_q.q_output:rscale"}> : () -> tensor<f32>
%17475 = "torch_c.from_builtin_tensor"(%17474) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17476 = "util.global.load"() <{global = @"__auto.blk.7.attn_k.q_input:rscale"}> : () -> tensor<f32>
%17477 = "torch_c.from_builtin_tensor"(%17476) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17478 = "util.global.load"() <{global = @"__auto.blk.7.attn_k.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
%17479 = "torch_c.from_builtin_tensor"(%17478) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%17480 = "util.global.load"() <{global = @"__auto.blk.7.attn_k.q_output:rscale"}> : () -> tensor<f32>
%17481 = "torch_c.from_builtin_tensor"(%17480) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17482 = "util.global.load"() <{global = @"__auto.blk.7.attn_v.q_input:rscale"}> : () -> tensor<f32>
%17483 = "torch_c.from_builtin_tensor"(%17482) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17484 = "util.global.load"() <{global = @"__auto.blk.7.attn_v.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
%17485 = "torch_c.from_builtin_tensor"(%17484) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%17486 = "util.global.load"() <{global = @"__auto.blk.7.attn_v.q_output:rscale"}> : () -> tensor<f32>
%17487 = "torch_c.from_builtin_tensor"(%17486) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17488 = "util.global.load"() <{global = @__auto.blk.7.attn_scale}> : () -> tensor<f32>
%17489 = "torch_c.from_builtin_tensor"(%17488) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17490 = "util.global.load"() <{global = @"__auto.blk.7.attn_output.q_input:rscale"}> : () -> tensor<f32>
%17491 = "torch_c.from_builtin_tensor"(%17490) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17492 = "util.global.load"() <{global = @"__auto.blk.7.attn_output.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
%17493 = "torch_c.from_builtin_tensor"(%17492) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%17494 = "util.global.load"() <{global = @__auto.blk.7.ffn_norm.weight}> : () -> tensor<4096xbf16>
%17495 = "torch_c.from_builtin_tensor"(%17494) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
%17496 = "util.global.load"() <{global = @"__auto.blk.7.ffn_gate.q_input:rscale"}> : () -> tensor<f32>
%17497 = "torch_c.from_builtin_tensor"(%17496) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17498 = "util.global.load"() <{global = @"__auto.blk.7.ffn_gate.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
%17499 = "torch_c.from_builtin_tensor"(%17498) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%17500 = "util.global.load"() <{global = @"__auto.blk.7.ffn_up.q_input:rscale"}> : () -> tensor<f32>
%17501 = "torch_c.from_builtin_tensor"(%17500) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17502 = "util.global.load"() <{global = @"__auto.blk.7.ffn_up.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
%17503 = "torch_c.from_builtin_tensor"(%17502) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%17504 = "util.global.load"() <{global = @"__auto.blk.7.ffn_down.q_input:rscale"}> : () -> tensor<f32>
%17505 = "torch_c.from_builtin_tensor"(%17504) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17506 = "util.global.load"() <{global = @"__auto.blk.7.ffn_down.weight:qs"}> : () -> tensor<4096x14336xf8E4M3FNUZ>
%17507 = "torch_c.from_builtin_tensor"(%17506) : (tensor<4096x14336xf8E4M3FNUZ>) -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
%17508 = "util.global.load"() <{global = @__auto.blk.8.attn_norm.weight}> : () -> tensor<4096xbf16>
%17509 = "torch_c.from_builtin_tensor"(%17508) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
%17510 = "util.global.load"() <{global = @"__auto.blk.8.attn_q.q_input:rscale"}> : () -> tensor<f32>
%17511 = "torch_c.from_builtin_tensor"(%17510) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17512 = "util.global.load"() <{global = @"__auto.blk.8.attn_q.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
%17513 = "torch_c.from_builtin_tensor"(%17512) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%17514 = "util.global.load"() <{global = @"__auto.blk.8.attn_q.q_output:rscale"}> : () -> tensor<f32>
%17515 = "torch_c.from_builtin_tensor"(%17514) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17516 = "util.global.load"() <{global = @"__auto.blk.8.attn_k.q_input:rscale"}> : () -> tensor<f32>
%17517 = "torch_c.from_builtin_tensor"(%17516) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17518 = "util.global.load"() <{global = @"__auto.blk.8.attn_k.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
%17519 = "torch_c.from_builtin_tensor"(%17518) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%17520 = "util.global.load"() <{global = @"__auto.blk.8.attn_k.q_output:rscale"}> : () -> tensor<f32>
%17521 = "torch_c.from_builtin_tensor"(%17520) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17522 = "util.global.load"() <{global = @"__auto.blk.8.attn_v.q_input:rscale"}> : () -> tensor<f32>
%17523 = "torch_c.from_builtin_tensor"(%17522) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17524 = "util.global.load"() <{global = @"__auto.blk.8.attn_v.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
%17525 = "torch_c.from_builtin_tensor"(%17524) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%17526 = "util.global.load"() <{global = @"__auto.blk.8.attn_v.q_output:rscale"}> : () -> tensor<f32>
%17527 = "torch_c.from_builtin_tensor"(%17526) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17528 = "util.global.load"() <{global = @__auto.blk.8.attn_scale}> : () -> tensor<f32>
%17529 = "torch_c.from_builtin_tensor"(%17528) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17530 = "util.global.load"() <{global = @"__auto.blk.8.attn_output.q_input:rscale"}> : () -> tensor<f32>
%17531 = "torch_c.from_builtin_tensor"(%17530) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17532 = "util.global.load"() <{global = @"__auto.blk.8.attn_output.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
%17533 = "torch_c.from_builtin_tensor"(%17532) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%17534 = "util.global.load"() <{global = @__auto.blk.8.ffn_norm.weight}> : () -> tensor<4096xbf16>
%17535 = "torch_c.from_builtin_tensor"(%17534) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
%17536 = "util.global.load"() <{global = @"__auto.blk.8.ffn_gate.q_input:rscale"}> : () -> tensor<f32>
%17537 = "torch_c.from_builtin_tensor"(%17536) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17538 = "util.global.load"() <{global = @"__auto.blk.8.ffn_gate.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
%17539 = "torch_c.from_builtin_tensor"(%17538) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%17540 = "util.global.load"() <{global = @"__auto.blk.8.ffn_up.q_input:rscale"}> : () -> tensor<f32>
%17541 = "torch_c.from_builtin_tensor"(%17540) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17542 = "util.global.load"() <{global = @"__auto.blk.8.ffn_up.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
%17543 = "torch_c.from_builtin_tensor"(%17542) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%17544 = "util.global.load"() <{global = @"__auto.blk.8.ffn_down.q_input:rscale"}> : () -> tensor<f32>
%17545 = "torch_c.from_builtin_tensor"(%17544) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17546 = "util.global.load"() <{global = @"__auto.blk.8.ffn_down.weight:qs"}> : () -> tensor<4096x14336xf8E4M3FNUZ>
%17547 = "torch_c.from_builtin_tensor"(%17546) : (tensor<4096x14336xf8E4M3FNUZ>) -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
%17548 = "util.global.load"() <{global = @__auto.blk.9.attn_norm.weight}> : () -> tensor<4096xbf16>
%17549 = "torch_c.from_builtin_tensor"(%17548) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
%17550 = "util.global.load"() <{global = @"__auto.blk.9.attn_q.q_input:rscale"}> : () -> tensor<f32>
%17551 = "torch_c.from_builtin_tensor"(%17550) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17552 = "util.global.load"() <{global = @"__auto.blk.9.attn_q.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
%17553 = "torch_c.from_builtin_tensor"(%17552) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%17554 = "util.global.load"() <{global = @"__auto.blk.9.attn_q.q_output:rscale"}> : () -> tensor<f32>
%17555 = "torch_c.from_builtin_tensor"(%17554) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17556 = "util.global.load"() <{global = @"__auto.blk.9.attn_k.q_input:rscale"}> : () -> tensor<f32>
%17557 = "torch_c.from_builtin_tensor"(%17556) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17558 = "util.global.load"() <{global = @"__auto.blk.9.attn_k.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
%17559 = "torch_c.from_builtin_tensor"(%17558) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%17560 = "util.global.load"() <{global = @"__auto.blk.9.attn_k.q_output:rscale"}> : () -> tensor<f32>
%17561 = "torch_c.from_builtin_tensor"(%17560) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17562 = "util.global.load"() <{global = @"__auto.blk.9.attn_v.q_input:rscale"}> : () -> tensor<f32>
%17563 = "torch_c.from_builtin_tensor"(%17562) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17564 = "util.global.load"() <{global = @"__auto.blk.9.attn_v.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
%17565 = "torch_c.from_builtin_tensor"(%17564) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%17566 = "util.global.load"() <{global = @"__auto.blk.9.attn_v.q_output:rscale"}> : () -> tensor<f32>
%17567 = "torch_c.from_builtin_tensor"(%17566) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17568 = "util.global.load"() <{global = @__auto.blk.9.attn_scale}> : () -> tensor<f32>
%17569 = "torch_c.from_builtin_tensor"(%17568) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17570 = "util.global.load"() <{global = @"__auto.blk.9.attn_output.q_input:rscale"}> : () -> tensor<f32>
%17571 = "torch_c.from_builtin_tensor"(%17570) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17572 = "util.global.load"() <{global = @"__auto.blk.9.attn_output.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
%17573 = "torch_c.from_builtin_tensor"(%17572) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%17574 = "util.global.load"() <{global = @__auto.blk.9.ffn_norm.weight}> : () -> tensor<4096xbf16>
%17575 = "torch_c.from_builtin_tensor"(%17574) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
%17576 = "util.global.load"() <{global = @"__auto.blk.9.ffn_gate.q_input:rscale"}> : () -> tensor<f32>
%17577 = "torch_c.from_builtin_tensor"(%17576) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17578 = "util.global.load"() <{global = @"__auto.blk.9.ffn_gate.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
%17579 = "torch_c.from_builtin_tensor"(%17578) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%17580 = "util.global.load"() <{global = @"__auto.blk.9.ffn_up.q_input:rscale"}> : () -> tensor<f32>
%17581 = "torch_c.from_builtin_tensor"(%17580) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17582 = "util.global.load"() <{global = @"__auto.blk.9.ffn_up.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
%17583 = "torch_c.from_builtin_tensor"(%17582) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%17584 = "util.global.load"() <{global = @"__auto.blk.9.ffn_down.q_input:rscale"}> : () -> tensor<f32>
%17585 = "torch_c.from_builtin_tensor"(%17584) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17586 = "util.global.load"() <{global = @"__auto.blk.9.ffn_down.weight:qs"}> : () -> tensor<4096x14336xf8E4M3FNUZ>
%17587 = "torch_c.from_builtin_tensor"(%17586) : (tensor<4096x14336xf8E4M3FNUZ>) -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
%17588 = "util.global.load"() <{global = @__auto.blk.10.attn_norm.weight}> : () -> tensor<4096xbf16>
%17589 = "torch_c.from_builtin_tensor"(%17588) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
%17590 = "util.global.load"() <{global = @"__auto.blk.10.attn_q.q_input:rscale"}> : () -> tensor<f32>
%17591 = "torch_c.from_builtin_tensor"(%17590) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17592 = "util.global.load"() <{global = @"__auto.blk.10.attn_q.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
%17593 = "torch_c.from_builtin_tensor"(%17592) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%17594 = "util.global.load"() <{global = @"__auto.blk.10.attn_q.q_output:rscale"}> : () -> tensor<f32>
%17595 = "torch_c.from_builtin_tensor"(%17594) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17596 = "util.global.load"() <{global = @"__auto.blk.10.attn_k.q_input:rscale"}> : () -> tensor<f32>
%17597 = "torch_c.from_builtin_tensor"(%17596) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17598 = "util.global.load"() <{global = @"__auto.blk.10.attn_k.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
%17599 = "torch_c.from_builtin_tensor"(%17598) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%17600 = "util.global.load"() <{global = @"__auto.blk.10.attn_k.q_output:rscale"}> : () -> tensor<f32>
%17601 = "torch_c.from_builtin_tensor"(%17600) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17602 = "util.global.load"() <{global = @"__auto.blk.10.attn_v.q_input:rscale"}> : () -> tensor<f32>
%17603 = "torch_c.from_builtin_tensor"(%17602) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17604 = "util.global.load"() <{global = @"__auto.blk.10.attn_v.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
%17605 = "torch_c.from_builtin_tensor"(%17604) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%17606 = "util.global.load"() <{global = @"__auto.blk.10.attn_v.q_output:rscale"}> : () -> tensor<f32>
%17607 = "torch_c.from_builtin_tensor"(%17606) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17608 = "util.global.load"() <{global = @__auto.blk.10.attn_scale}> : () -> tensor<f32>
%17609 = "torch_c.from_builtin_tensor"(%17608) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17610 = "util.global.load"() <{global = @"__auto.blk.10.attn_output.q_input:rscale"}> : () -> tensor<f32>
%17611 = "torch_c.from_builtin_tensor"(%17610) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17612 = "util.global.load"() <{global = @"__auto.blk.10.attn_output.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
%17613 = "torch_c.from_builtin_tensor"(%17612) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%17614 = "util.global.load"() <{global = @__auto.blk.10.ffn_norm.weight}> : () -> tensor<4096xbf16>
%17615 = "torch_c.from_builtin_tensor"(%17614) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
%17616 = "util.global.load"() <{global = @"__auto.blk.10.ffn_gate.q_input:rscale"}> : () -> tensor<f32>
%17617 = "torch_c.from_builtin_tensor"(%17616) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17618 = "util.global.load"() <{global = @"__auto.blk.10.ffn_gate.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
%17619 = "torch_c.from_builtin_tensor"(%17618) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%17620 = "util.global.load"() <{global = @"__auto.blk.10.ffn_up.q_input:rscale"}> : () -> tensor<f32>
%17621 = "torch_c.from_builtin_tensor"(%17620) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17622 = "util.global.load"() <{global = @"__auto.blk.10.ffn_up.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
%17623 = "torch_c.from_builtin_tensor"(%17622) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%17624 = "util.global.load"() <{global = @"__auto.blk.10.ffn_down.q_input:rscale"}> : () -> tensor<f32>
%17625 = "torch_c.from_builtin_tensor"(%17624) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17626 = "util.global.load"() <{global = @"__auto.blk.10.ffn_down.weight:qs"}> : () -> tensor<4096x14336xf8E4M3FNUZ>
%17627 = "torch_c.from_builtin_tensor"(%17626) : (tensor<4096x14336xf8E4M3FNUZ>) -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
%17628 = "util.global.load"() <{global = @__auto.blk.11.attn_norm.weight}> : () -> tensor<4096xbf16>
%17629 = "torch_c.from_builtin_tensor"(%17628) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
%17630 = "util.global.load"() <{global = @"__auto.blk.11.attn_q.q_input:rscale"}> : () -> tensor<f32>
%17631 = "torch_c.from_builtin_tensor"(%17630) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17632 = "util.global.load"() <{global = @"__auto.blk.11.attn_q.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
%17633 = "torch_c.from_builtin_tensor"(%17632) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%17634 = "util.global.load"() <{global = @"__auto.blk.11.attn_q.q_output:rscale"}> : () -> tensor<f32>
%17635 = "torch_c.from_builtin_tensor"(%17634) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17636 = "util.global.load"() <{global = @"__auto.blk.11.attn_k.q_input:rscale"}> : () -> tensor<f32>
%17637 = "torch_c.from_builtin_tensor"(%17636) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17638 = "util.global.load"() <{global = @"__auto.blk.11.attn_k.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
%17639 = "torch_c.from_builtin_tensor"(%17638) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%17640 = "util.global.load"() <{global = @"__auto.blk.11.attn_k.q_output:rscale"}> : () -> tensor<f32>
%17641 = "torch_c.from_builtin_tensor"(%17640) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17642 = "util.global.load"() <{global = @"__auto.blk.11.attn_v.q_input:rscale"}> : () -> tensor<f32>
%17643 = "torch_c.from_builtin_tensor"(%17642) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17644 = "util.global.load"() <{global = @"__auto.blk.11.attn_v.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
%17645 = "torch_c.from_builtin_tensor"(%17644) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%17646 = "util.global.load"() <{global = @"__auto.blk.11.attn_v.q_output:rscale"}> : () -> tensor<f32>
%17647 = "torch_c.from_builtin_tensor"(%17646) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17648 = "util.global.load"() <{global = @__auto.blk.11.attn_scale}> : () -> tensor<f32>
%17649 = "torch_c.from_builtin_tensor"(%17648) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17650 = "util.global.load"() <{global = @"__auto.blk.11.attn_output.q_input:rscale"}> : () -> tensor<f32>
%17651 = "torch_c.from_builtin_tensor"(%17650) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17652 = "util.global.load"() <{global = @"__auto.blk.11.attn_output.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
%17653 = "torch_c.from_builtin_tensor"(%17652) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%17654 = "util.global.load"() <{global = @__auto.blk.11.ffn_norm.weight}> : () -> tensor<4096xbf16>
%17655 = "torch_c.from_builtin_tensor"(%17654) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
%17656 = "util.global.load"() <{global = @"__auto.blk.11.ffn_gate.q_input:rscale"}> : () -> tensor<f32>
%17657 = "torch_c.from_builtin_tensor"(%17656) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17658 = "util.global.load"() <{global = @"__auto.blk.11.ffn_gate.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
%17659 = "torch_c.from_builtin_tensor"(%17658) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%17660 = "util.global.load"() <{global = @"__auto.blk.11.ffn_up.q_input:rscale"}> : () -> tensor<f32>
%17661 = "torch_c.from_builtin_tensor"(%17660) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17662 = "util.global.load"() <{global = @"__auto.blk.11.ffn_up.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
%17663 = "torch_c.from_builtin_tensor"(%17662) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%17664 = "util.global.load"() <{global = @"__auto.blk.11.ffn_down.q_input:rscale"}> : () -> tensor<f32>
%17665 = "torch_c.from_builtin_tensor"(%17664) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17666 = "util.global.load"() <{global = @"__auto.blk.11.ffn_down.weight:qs"}> : () -> tensor<4096x14336xf8E4M3FNUZ>
%17667 = "torch_c.from_builtin_tensor"(%17666) : (tensor<4096x14336xf8E4M3FNUZ>) -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
%17668 = "util.global.load"() <{global = @__auto.blk.12.attn_norm.weight}> : () -> tensor<4096xbf16>
%17669 = "torch_c.from_builtin_tensor"(%17668) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
%17670 = "util.global.load"() <{global = @"__auto.blk.12.attn_q.q_input:rscale"}> : () -> tensor<f32>
%17671 = "torch_c.from_builtin_tensor"(%17670) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17672 = "util.global.load"() <{global = @"__auto.blk.12.attn_q.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
%17673 = "torch_c.from_builtin_tensor"(%17672) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%17674 = "util.global.load"() <{global = @"__auto.blk.12.attn_q.q_output:rscale"}> : () -> tensor<f32>
%17675 = "torch_c.from_builtin_tensor"(%17674) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17676 = "util.global.load"() <{global = @"__auto.blk.12.attn_k.q_input:rscale"}> : () -> tensor<f32>
%17677 = "torch_c.from_builtin_tensor"(%17676) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17678 = "util.global.load"() <{global = @"__auto.blk.12.attn_k.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
%17679 = "torch_c.from_builtin_tensor"(%17678) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%17680 = "util.global.load"() <{global = @"__auto.blk.12.attn_k.q_output:rscale"}> : () -> tensor<f32>
%17681 = "torch_c.from_builtin_tensor"(%17680) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17682 = "util.global.load"() <{global = @"__auto.blk.12.attn_v.q_input:rscale"}> : () -> tensor<f32>
%17683 = "torch_c.from_builtin_tensor"(%17682) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17684 = "util.global.load"() <{global = @"__auto.blk.12.attn_v.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
%17685 = "torch_c.from_builtin_tensor"(%17684) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%17686 = "util.global.load"() <{global = @"__auto.blk.12.attn_v.q_output:rscale"}> : () -> tensor<f32>
%17687 = "torch_c.from_builtin_tensor"(%17686) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17688 = "util.global.load"() <{global = @__auto.blk.12.attn_scale}> : () -> tensor<f32>
%17689 = "torch_c.from_builtin_tensor"(%17688) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17690 = "util.global.load"() <{global = @"__auto.blk.12.attn_output.q_input:rscale"}> : () -> tensor<f32>
%17691 = "torch_c.from_builtin_tensor"(%17690) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17692 = "util.global.load"() <{global = @"__auto.blk.12.attn_output.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
%17693 = "torch_c.from_builtin_tensor"(%17692) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%17694 = "util.global.load"() <{global = @__auto.blk.12.ffn_norm.weight}> : () -> tensor<4096xbf16>
%17695 = "torch_c.from_builtin_tensor"(%17694) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
%17696 = "util.global.load"() <{global = @"__auto.blk.12.ffn_gate.q_input:rscale"}> : () -> tensor<f32>
%17697 = "torch_c.from_builtin_tensor"(%17696) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17698 = "util.global.load"() <{global = @"__auto.blk.12.ffn_gate.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
%17699 = "torch_c.from_builtin_tensor"(%17698) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%17700 = "util.global.load"() <{global = @"__auto.blk.12.ffn_up.q_input:rscale"}> : () -> tensor<f32>
%17701 = "torch_c.from_builtin_tensor"(%17700) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17702 = "util.global.load"() <{global = @"__auto.blk.12.ffn_up.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
%17703 = "torch_c.from_builtin_tensor"(%17702) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%17704 = "util.global.load"() <{global = @"__auto.blk.12.ffn_down.q_input:rscale"}> : () -> tensor<f32>
%17705 = "torch_c.from_builtin_tensor"(%17704) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17706 = "util.global.load"() <{global = @"__auto.blk.12.ffn_down.weight:qs"}> : () -> tensor<4096x14336xf8E4M3FNUZ>
%17707 = "torch_c.from_builtin_tensor"(%17706) : (tensor<4096x14336xf8E4M3FNUZ>) -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
%17708 = "util.global.load"() <{global = @__auto.blk.13.attn_norm.weight}> : () -> tensor<4096xbf16>
%17709 = "torch_c.from_builtin_tensor"(%17708) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
%17710 = "util.global.load"() <{global = @"__auto.blk.13.attn_q.q_input:rscale"}> : () -> tensor<f32>
%17711 = "torch_c.from_builtin_tensor"(%17710) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17712 = "util.global.load"() <{global = @"__auto.blk.13.attn_q.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
%17713 = "torch_c.from_builtin_tensor"(%17712) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%17714 = "util.global.load"() <{global = @"__auto.blk.13.attn_q.q_output:rscale"}> : () -> tensor<f32>
%17715 = "torch_c.from_builtin_tensor"(%17714) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17716 = "util.global.load"() <{global = @"__auto.blk.13.attn_k.q_input:rscale"}> : () -> tensor<f32>
%17717 = "torch_c.from_builtin_tensor"(%17716) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17718 = "util.global.load"() <{global = @"__auto.blk.13.attn_k.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
%17719 = "torch_c.from_builtin_tensor"(%17718) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%17720 = "util.global.load"() <{global = @"__auto.blk.13.attn_k.q_output:rscale"}> : () -> tensor<f32>
%17721 = "torch_c.from_builtin_tensor"(%17720) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17722 = "util.global.load"() <{global = @"__auto.blk.13.attn_v.q_input:rscale"}> : () -> tensor<f32>
%17723 = "torch_c.from_builtin_tensor"(%17722) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17724 = "util.global.load"() <{global = @"__auto.blk.13.attn_v.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
%17725 = "torch_c.from_builtin_tensor"(%17724) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%17726 = "util.global.load"() <{global = @"__auto.blk.13.attn_v.q_output:rscale"}> : () -> tensor<f32>
%17727 = "torch_c.from_builtin_tensor"(%17726) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17728 = "util.global.load"() <{global = @__auto.blk.13.attn_scale}> : () -> tensor<f32>
%17729 = "torch_c.from_builtin_tensor"(%17728) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17730 = "util.global.load"() <{global = @"__auto.blk.13.attn_output.q_input:rscale"}> : () -> tensor<f32>
%17731 = "torch_c.from_builtin_tensor"(%17730) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17732 = "util.global.load"() <{global = @"__auto.blk.13.attn_output.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
%17733 = "torch_c.from_builtin_tensor"(%17732) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%17734 = "util.global.load"() <{global = @__auto.blk.13.ffn_norm.weight}> : () -> tensor<4096xbf16>
%17735 = "torch_c.from_builtin_tensor"(%17734) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
%17736 = "util.global.load"() <{global = @"__auto.blk.13.ffn_gate.q_input:rscale"}> : () -> tensor<f32>
%17737 = "torch_c.from_builtin_tensor"(%17736) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17738 = "util.global.load"() <{global = @"__auto.blk.13.ffn_gate.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
%17739 = "torch_c.from_builtin_tensor"(%17738) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%17740 = "util.global.load"() <{global = @"__auto.blk.13.ffn_up.q_input:rscale"}> : () -> tensor<f32>
%17741 = "torch_c.from_builtin_tensor"(%17740) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17742 = "util.global.load"() <{global = @"__auto.blk.13.ffn_up.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
%17743 = "torch_c.from_builtin_tensor"(%17742) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%17744 = "util.global.load"() <{global = @"__auto.blk.13.ffn_down.q_input:rscale"}> : () -> tensor<f32>
%17745 = "torch_c.from_builtin_tensor"(%17744) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17746 = "util.global.load"() <{global = @"__auto.blk.13.ffn_down.weight:qs"}> : () -> tensor<4096x14336xf8E4M3FNUZ>
%17747 = "torch_c.from_builtin_tensor"(%17746) : (tensor<4096x14336xf8E4M3FNUZ>) -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
%17748 = "util.global.load"() <{global = @__auto.blk.14.attn_norm.weight}> : () -> tensor<4096xbf16>
%17749 = "torch_c.from_builtin_tensor"(%17748) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
%17750 = "util.global.load"() <{global = @"__auto.blk.14.attn_q.q_input:rscale"}> : () -> tensor<f32>
%17751 = "torch_c.from_builtin_tensor"(%17750) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17752 = "util.global.load"() <{global = @"__auto.blk.14.attn_q.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
%17753 = "torch_c.from_builtin_tensor"(%17752) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%17754 = "util.global.load"() <{global = @"__auto.blk.14.attn_q.q_output:rscale"}> : () -> tensor<f32>
%17755 = "torch_c.from_builtin_tensor"(%17754) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17756 = "util.global.load"() <{global = @"__auto.blk.14.attn_k.q_input:rscale"}> : () -> tensor<f32>
%17757 = "torch_c.from_builtin_tensor"(%17756) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17758 = "util.global.load"() <{global = @"__auto.blk.14.attn_k.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
%17759 = "torch_c.from_builtin_tensor"(%17758) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%17760 = "util.global.load"() <{global = @"__auto.blk.14.attn_k.q_output:rscale"}> : () -> tensor<f32>
%17761 = "torch_c.from_builtin_tensor"(%17760) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17762 = "util.global.load"() <{global = @"__auto.blk.14.attn_v.q_input:rscale"}> : () -> tensor<f32>
%17763 = "torch_c.from_builtin_tensor"(%17762) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17764 = "util.global.load"() <{global = @"__auto.blk.14.attn_v.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
%17765 = "torch_c.from_builtin_tensor"(%17764) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%17766 = "util.global.load"() <{global = @"__auto.blk.14.attn_v.q_output:rscale"}> : () -> tensor<f32>
%17767 = "torch_c.from_builtin_tensor"(%17766) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17768 = "util.global.load"() <{global = @__auto.blk.14.attn_scale}> : () -> tensor<f32>
%17769 = "torch_c.from_builtin_tensor"(%17768) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17770 = "util.global.load"() <{global = @"__auto.blk.14.attn_output.q_input:rscale"}> : () -> tensor<f32>
%17771 = "torch_c.from_builtin_tensor"(%17770) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17772 = "util.global.load"() <{global = @"__auto.blk.14.attn_output.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
%17773 = "torch_c.from_builtin_tensor"(%17772) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%17774 = "util.global.load"() <{global = @__auto.blk.14.ffn_norm.weight}> : () -> tensor<4096xbf16>
%17775 = "torch_c.from_builtin_tensor"(%17774) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
%17776 = "util.global.load"() <{global = @"__auto.blk.14.ffn_gate.q_input:rscale"}> : () -> tensor<f32>
%17777 = "torch_c.from_builtin_tensor"(%17776) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17778 = "util.global.load"() <{global = @"__auto.blk.14.ffn_gate.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
%17779 = "torch_c.from_builtin_tensor"(%17778) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%17780 = "util.global.load"() <{global = @"__auto.blk.14.ffn_up.q_input:rscale"}> : () -> tensor<f32>
%17781 = "torch_c.from_builtin_tensor"(%17780) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17782 = "util.global.load"() <{global = @"__auto.blk.14.ffn_up.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
%17783 = "torch_c.from_builtin_tensor"(%17782) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%17784 = "util.global.load"() <{global = @"__auto.blk.14.ffn_down.q_input:rscale"}> : () -> tensor<f32>
%17785 = "torch_c.from_builtin_tensor"(%17784) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17786 = "util.global.load"() <{global = @"__auto.blk.14.ffn_down.weight:qs"}> : () -> tensor<4096x14336xf8E4M3FNUZ>
%17787 = "torch_c.from_builtin_tensor"(%17786) : (tensor<4096x14336xf8E4M3FNUZ>) -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
%17788 = "util.global.load"() <{global = @__auto.blk.15.attn_norm.weight}> : () -> tensor<4096xbf16>
%17789 = "torch_c.from_builtin_tensor"(%17788) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
%17790 = "util.global.load"() <{global = @"__auto.blk.15.attn_q.q_input:rscale"}> : () -> tensor<f32>
%17791 = "torch_c.from_builtin_tensor"(%17790) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17792 = "util.global.load"() <{global = @"__auto.blk.15.attn_q.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
%17793 = "torch_c.from_builtin_tensor"(%17792) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%17794 = "util.global.load"() <{global = @"__auto.blk.15.attn_q.q_output:rscale"}> : () -> tensor<f32>
%17795 = "torch_c.from_builtin_tensor"(%17794) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17796 = "util.global.load"() <{global = @"__auto.blk.15.attn_k.q_input:rscale"}> : () -> tensor<f32>
%17797 = "torch_c.from_builtin_tensor"(%17796) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17798 = "util.global.load"() <{global = @"__auto.blk.15.attn_k.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
%17799 = "torch_c.from_builtin_tensor"(%17798) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%17800 = "util.global.load"() <{global = @"__auto.blk.15.attn_k.q_output:rscale"}> : () -> tensor<f32>
%17801 = "torch_c.from_builtin_tensor"(%17800) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17802 = "util.global.load"() <{global = @"__auto.blk.15.attn_v.q_input:rscale"}> : () -> tensor<f32>
%17803 = "torch_c.from_builtin_tensor"(%17802) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17804 = "util.global.load"() <{global = @"__auto.blk.15.attn_v.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
%17805 = "torch_c.from_builtin_tensor"(%17804) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%17806 = "util.global.load"() <{global = @"__auto.blk.15.attn_v.q_output:rscale"}> : () -> tensor<f32>
%17807 = "torch_c.from_builtin_tensor"(%17806) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17808 = "util.global.load"() <{global = @__auto.blk.15.attn_scale}> : () -> tensor<f32>
%17809 = "torch_c.from_builtin_tensor"(%17808) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17810 = "util.global.load"() <{global = @"__auto.blk.15.attn_output.q_input:rscale"}> : () -> tensor<f32>
%17811 = "torch_c.from_builtin_tensor"(%17810) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17812 = "util.global.load"() <{global = @"__auto.blk.15.attn_output.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
%17813 = "torch_c.from_builtin_tensor"(%17812) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%17814 = "util.global.load"() <{global = @__auto.blk.15.ffn_norm.weight}> : () -> tensor<4096xbf16>
%17815 = "torch_c.from_builtin_tensor"(%17814) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
%17816 = "util.global.load"() <{global = @"__auto.blk.15.ffn_gate.q_input:rscale"}> : () -> tensor<f32>
%17817 = "torch_c.from_builtin_tensor"(%17816) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17818 = "util.global.load"() <{global = @"__auto.blk.15.ffn_gate.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
%17819 = "torch_c.from_builtin_tensor"(%17818) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%17820 = "util.global.load"() <{global = @"__auto.blk.15.ffn_up.q_input:rscale"}> : () -> tensor<f32>
%17821 = "torch_c.from_builtin_tensor"(%17820) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17822 = "util.global.load"() <{global = @"__auto.blk.15.ffn_up.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
%17823 = "torch_c.from_builtin_tensor"(%17822) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%17824 = "util.global.load"() <{global = @"__auto.blk.15.ffn_down.q_input:rscale"}> : () -> tensor<f32>
%17825 = "torch_c.from_builtin_tensor"(%17824) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17826 = "util.global.load"() <{global = @"__auto.blk.15.ffn_down.weight:qs"}> : () -> tensor<4096x14336xf8E4M3FNUZ>
%17827 = "torch_c.from_builtin_tensor"(%17826) : (tensor<4096x14336xf8E4M3FNUZ>) -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
%17828 = "util.global.load"() <{global = @__auto.blk.16.attn_norm.weight}> : () -> tensor<4096xbf16>
%17829 = "torch_c.from_builtin_tensor"(%17828) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
%17830 = "util.global.load"() <{global = @"__auto.blk.16.attn_q.q_input:rscale"}> : () -> tensor<f32>
%17831 = "torch_c.from_builtin_tensor"(%17830) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17832 = "util.global.load"() <{global = @"__auto.blk.16.attn_q.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
%17833 = "torch_c.from_builtin_tensor"(%17832) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%17834 = "util.global.load"() <{global = @"__auto.blk.16.attn_q.q_output:rscale"}> : () -> tensor<f32>
%17835 = "torch_c.from_builtin_tensor"(%17834) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17836 = "util.global.load"() <{global = @"__auto.blk.16.attn_k.q_input:rscale"}> : () -> tensor<f32>
%17837 = "torch_c.from_builtin_tensor"(%17836) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17838 = "util.global.load"() <{global = @"__auto.blk.16.attn_k.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
%17839 = "torch_c.from_builtin_tensor"(%17838) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%17840 = "util.global.load"() <{global = @"__auto.blk.16.attn_k.q_output:rscale"}> : () -> tensor<f32>
%17841 = "torch_c.from_builtin_tensor"(%17840) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17842 = "util.global.load"() <{global = @"__auto.blk.16.attn_v.q_input:rscale"}> : () -> tensor<f32>
%17843 = "torch_c.from_builtin_tensor"(%17842) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17844 = "util.global.load"() <{global = @"__auto.blk.16.attn_v.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
%17845 = "torch_c.from_builtin_tensor"(%17844) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%17846 = "util.global.load"() <{global = @"__auto.blk.16.attn_v.q_output:rscale"}> : () -> tensor<f32>
%17847 = "torch_c.from_builtin_tensor"(%17846) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17848 = "util.global.load"() <{global = @__auto.blk.16.attn_scale}> : () -> tensor<f32>
%17849 = "torch_c.from_builtin_tensor"(%17848) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17850 = "util.global.load"() <{global = @"__auto.blk.16.attn_output.q_input:rscale"}> : () -> tensor<f32>
%17851 = "torch_c.from_builtin_tensor"(%17850) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17852 = "util.global.load"() <{global = @"__auto.blk.16.attn_output.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
%17853 = "torch_c.from_builtin_tensor"(%17852) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%17854 = "util.global.load"() <{global = @__auto.blk.16.ffn_norm.weight}> : () -> tensor<4096xbf16>
%17855 = "torch_c.from_builtin_tensor"(%17854) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
%17856 = "util.global.load"() <{global = @"__auto.blk.16.ffn_gate.q_input:rscale"}> : () -> tensor<f32>
%17857 = "torch_c.from_builtin_tensor"(%17856) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17858 = "util.global.load"() <{global = @"__auto.blk.16.ffn_gate.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
%17859 = "torch_c.from_builtin_tensor"(%17858) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%17860 = "util.global.load"() <{global = @"__auto.blk.16.ffn_up.q_input:rscale"}> : () -> tensor<f32>
%17861 = "torch_c.from_builtin_tensor"(%17860) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17862 = "util.global.load"() <{global = @"__auto.blk.16.ffn_up.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
%17863 = "torch_c.from_builtin_tensor"(%17862) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%17864 = "util.global.load"() <{global = @"__auto.blk.16.ffn_down.q_input:rscale"}> : () -> tensor<f32>
%17865 = "torch_c.from_builtin_tensor"(%17864) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17866 = "util.global.load"() <{global = @"__auto.blk.16.ffn_down.weight:qs"}> : () -> tensor<4096x14336xf8E4M3FNUZ>
%17867 = "torch_c.from_builtin_tensor"(%17866) : (tensor<4096x14336xf8E4M3FNUZ>) -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
%17868 = "util.global.load"() <{global = @__auto.blk.17.attn_norm.weight}> : () -> tensor<4096xbf16>
%17869 = "torch_c.from_builtin_tensor"(%17868) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
%17870 = "util.global.load"() <{global = @"__auto.blk.17.attn_q.q_input:rscale"}> : () -> tensor<f32>
%17871 = "torch_c.from_builtin_tensor"(%17870) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17872 = "util.global.load"() <{global = @"__auto.blk.17.attn_q.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
%17873 = "torch_c.from_builtin_tensor"(%17872) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%17874 = "util.global.load"() <{global = @"__auto.blk.17.attn_q.q_output:rscale"}> : () -> tensor<f32>
%17875 = "torch_c.from_builtin_tensor"(%17874) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17876 = "util.global.load"() <{global = @"__auto.blk.17.attn_k.q_input:rscale"}> : () -> tensor<f32>
%17877 = "torch_c.from_builtin_tensor"(%17876) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17878 = "util.global.load"() <{global = @"__auto.blk.17.attn_k.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
%17879 = "torch_c.from_builtin_tensor"(%17878) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%17880 = "util.global.load"() <{global = @"__auto.blk.17.attn_k.q_output:rscale"}> : () -> tensor<f32>
%17881 = "torch_c.from_builtin_tensor"(%17880) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17882 = "util.global.load"() <{global = @"__auto.blk.17.attn_v.q_input:rscale"}> : () -> tensor<f32>
%17883 = "torch_c.from_builtin_tensor"(%17882) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17884 = "util.global.load"() <{global = @"__auto.blk.17.attn_v.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
%17885 = "torch_c.from_builtin_tensor"(%17884) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%17886 = "util.global.load"() <{global = @"__auto.blk.17.attn_v.q_output:rscale"}> : () -> tensor<f32>
%17887 = "torch_c.from_builtin_tensor"(%17886) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17888 = "util.global.load"() <{global = @__auto.blk.17.attn_scale}> : () -> tensor<f32>
%17889 = "torch_c.from_builtin_tensor"(%17888) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17890 = "util.global.load"() <{global = @"__auto.blk.17.attn_output.q_input:rscale"}> : () -> tensor<f32>
%17891 = "torch_c.from_builtin_tensor"(%17890) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17892 = "util.global.load"() <{global = @"__auto.blk.17.attn_output.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
%17893 = "torch_c.from_builtin_tensor"(%17892) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%17894 = "util.global.load"() <{global = @__auto.blk.17.ffn_norm.weight}> : () -> tensor<4096xbf16>
%17895 = "torch_c.from_builtin_tensor"(%17894) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
%17896 = "util.global.load"() <{global = @"__auto.blk.17.ffn_gate.q_input:rscale"}> : () -> tensor<f32>
%17897 = "torch_c.from_builtin_tensor"(%17896) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17898 = "util.global.load"() <{global = @"__auto.blk.17.ffn_gate.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
%17899 = "torch_c.from_builtin_tensor"(%17898) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%17900 = "util.global.load"() <{global = @"__auto.blk.17.ffn_up.q_input:rscale"}> : () -> tensor<f32>
%17901 = "torch_c.from_builtin_tensor"(%17900) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17902 = "util.global.load"() <{global = @"__auto.blk.17.ffn_up.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
%17903 = "torch_c.from_builtin_tensor"(%17902) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%17904 = "util.global.load"() <{global = @"__auto.blk.17.ffn_down.q_input:rscale"}> : () -> tensor<f32>
%17905 = "torch_c.from_builtin_tensor"(%17904) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17906 = "util.global.load"() <{global = @"__auto.blk.17.ffn_down.weight:qs"}> : () -> tensor<4096x14336xf8E4M3FNUZ>
%17907 = "torch_c.from_builtin_tensor"(%17906) : (tensor<4096x14336xf8E4M3FNUZ>) -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
%17908 = "util.global.load"() <{global = @__auto.blk.18.attn_norm.weight}> : () -> tensor<4096xbf16>
%17909 = "torch_c.from_builtin_tensor"(%17908) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
%17910 = "util.global.load"() <{global = @"__auto.blk.18.attn_q.q_input:rscale"}> : () -> tensor<f32>
%17911 = "torch_c.from_builtin_tensor"(%17910) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17912 = "util.global.load"() <{global = @"__auto.blk.18.attn_q.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
%17913 = "torch_c.from_builtin_tensor"(%17912) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%17914 = "util.global.load"() <{global = @"__auto.blk.18.attn_q.q_output:rscale"}> : () -> tensor<f32>
%17915 = "torch_c.from_builtin_tensor"(%17914) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17916 = "util.global.load"() <{global = @"__auto.blk.18.attn_k.q_input:rscale"}> : () -> tensor<f32>
%17917 = "torch_c.from_builtin_tensor"(%17916) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17918 = "util.global.load"() <{global = @"__auto.blk.18.attn_k.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
%17919 = "torch_c.from_builtin_tensor"(%17918) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%17920 = "util.global.load"() <{global = @"__auto.blk.18.attn_k.q_output:rscale"}> : () -> tensor<f32>
%17921 = "torch_c.from_builtin_tensor"(%17920) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17922 = "util.global.load"() <{global = @"__auto.blk.18.attn_v.q_input:rscale"}> : () -> tensor<f32>
%17923 = "torch_c.from_builtin_tensor"(%17922) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17924 = "util.global.load"() <{global = @"__auto.blk.18.attn_v.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
%17925 = "torch_c.from_builtin_tensor"(%17924) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%17926 = "util.global.load"() <{global = @"__auto.blk.18.attn_v.q_output:rscale"}> : () -> tensor<f32>
%17927 = "torch_c.from_builtin_tensor"(%17926) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17928 = "util.global.load"() <{global = @__auto.blk.18.attn_scale}> : () -> tensor<f32>
%17929 = "torch_c.from_builtin_tensor"(%17928) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17930 = "util.global.load"() <{global = @"__auto.blk.18.attn_output.q_input:rscale"}> : () -> tensor<f32>
%17931 = "torch_c.from_builtin_tensor"(%17930) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17932 = "util.global.load"() <{global = @"__auto.blk.18.attn_output.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
%17933 = "torch_c.from_builtin_tensor"(%17932) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%17934 = "util.global.load"() <{global = @__auto.blk.18.ffn_norm.weight}> : () -> tensor<4096xbf16>
%17935 = "torch_c.from_builtin_tensor"(%17934) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
%17936 = "util.global.load"() <{global = @"__auto.blk.18.ffn_gate.q_input:rscale"}> : () -> tensor<f32>
%17937 = "torch_c.from_builtin_tensor"(%17936) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17938 = "util.global.load"() <{global = @"__auto.blk.18.ffn_gate.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
%17939 = "torch_c.from_builtin_tensor"(%17938) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%17940 = "util.global.load"() <{global = @"__auto.blk.18.ffn_up.q_input:rscale"}> : () -> tensor<f32>
%17941 = "torch_c.from_builtin_tensor"(%17940) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17942 = "util.global.load"() <{global = @"__auto.blk.18.ffn_up.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
%17943 = "torch_c.from_builtin_tensor"(%17942) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%17944 = "util.global.load"() <{global = @"__auto.blk.18.ffn_down.q_input:rscale"}> : () -> tensor<f32>
%17945 = "torch_c.from_builtin_tensor"(%17944) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17946 = "util.global.load"() <{global = @"__auto.blk.18.ffn_down.weight:qs"}> : () -> tensor<4096x14336xf8E4M3FNUZ>
%17947 = "torch_c.from_builtin_tensor"(%17946) : (tensor<4096x14336xf8E4M3FNUZ>) -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
%17948 = "util.global.load"() <{global = @__auto.blk.19.attn_norm.weight}> : () -> tensor<4096xbf16>
%17949 = "torch_c.from_builtin_tensor"(%17948) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
%17950 = "util.global.load"() <{global = @"__auto.blk.19.attn_q.q_input:rscale"}> : () -> tensor<f32>
%17951 = "torch_c.from_builtin_tensor"(%17950) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17952 = "util.global.load"() <{global = @"__auto.blk.19.attn_q.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
%17953 = "torch_c.from_builtin_tensor"(%17952) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%17954 = "util.global.load"() <{global = @"__auto.blk.19.attn_q.q_output:rscale"}> : () -> tensor<f32>
%17955 = "torch_c.from_builtin_tensor"(%17954) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17956 = "util.global.load"() <{global = @"__auto.blk.19.attn_k.q_input:rscale"}> : () -> tensor<f32>
%17957 = "torch_c.from_builtin_tensor"(%17956) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17958 = "util.global.load"() <{global = @"__auto.blk.19.attn_k.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
%17959 = "torch_c.from_builtin_tensor"(%17958) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%17960 = "util.global.load"() <{global = @"__auto.blk.19.attn_k.q_output:rscale"}> : () -> tensor<f32>
%17961 = "torch_c.from_builtin_tensor"(%17960) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17962 = "util.global.load"() <{global = @"__auto.blk.19.attn_v.q_input:rscale"}> : () -> tensor<f32>
%17963 = "torch_c.from_builtin_tensor"(%17962) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17964 = "util.global.load"() <{global = @"__auto.blk.19.attn_v.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
%17965 = "torch_c.from_builtin_tensor"(%17964) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%17966 = "util.global.load"() <{global = @"__auto.blk.19.attn_v.q_output:rscale"}> : () -> tensor<f32>
%17967 = "torch_c.from_builtin_tensor"(%17966) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17968 = "util.global.load"() <{global = @__auto.blk.19.attn_scale}> : () -> tensor<f32>
%17969 = "torch_c.from_builtin_tensor"(%17968) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17970 = "util.global.load"() <{global = @"__auto.blk.19.attn_output.q_input:rscale"}> : () -> tensor<f32>
%17971 = "torch_c.from_builtin_tensor"(%17970) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17972 = "util.global.load"() <{global = @"__auto.blk.19.attn_output.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
%17973 = "torch_c.from_builtin_tensor"(%17972) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%17974 = "util.global.load"() <{global = @__auto.blk.19.ffn_norm.weight}> : () -> tensor<4096xbf16>
%17975 = "torch_c.from_builtin_tensor"(%17974) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
%17976 = "util.global.load"() <{global = @"__auto.blk.19.ffn_gate.q_input:rscale"}> : () -> tensor<f32>
%17977 = "torch_c.from_builtin_tensor"(%17976) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17978 = "util.global.load"() <{global = @"__auto.blk.19.ffn_gate.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
%17979 = "torch_c.from_builtin_tensor"(%17978) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%17980 = "util.global.load"() <{global = @"__auto.blk.19.ffn_up.q_input:rscale"}> : () -> tensor<f32>
%17981 = "torch_c.from_builtin_tensor"(%17980) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17982 = "util.global.load"() <{global = @"__auto.blk.19.ffn_up.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
%17983 = "torch_c.from_builtin_tensor"(%17982) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%17984 = "util.global.load"() <{global = @"__auto.blk.19.ffn_down.q_input:rscale"}> : () -> tensor<f32>
%17985 = "torch_c.from_builtin_tensor"(%17984) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17986 = "util.global.load"() <{global = @"__auto.blk.19.ffn_down.weight:qs"}> : () -> tensor<4096x14336xf8E4M3FNUZ>
%17987 = "torch_c.from_builtin_tensor"(%17986) : (tensor<4096x14336xf8E4M3FNUZ>) -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
%17988 = "util.global.load"() <{global = @__auto.blk.20.attn_norm.weight}> : () -> tensor<4096xbf16>
%17989 = "torch_c.from_builtin_tensor"(%17988) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
%17990 = "util.global.load"() <{global = @"__auto.blk.20.attn_q.q_input:rscale"}> : () -> tensor<f32>
%17991 = "torch_c.from_builtin_tensor"(%17990) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17992 = "util.global.load"() <{global = @"__auto.blk.20.attn_q.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
%17993 = "torch_c.from_builtin_tensor"(%17992) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%17994 = "util.global.load"() <{global = @"__auto.blk.20.attn_q.q_output:rscale"}> : () -> tensor<f32>
%17995 = "torch_c.from_builtin_tensor"(%17994) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17996 = "util.global.load"() <{global = @"__auto.blk.20.attn_k.q_input:rscale"}> : () -> tensor<f32>
%17997 = "torch_c.from_builtin_tensor"(%17996) : (tensor<f32>) -> !torch.vtensor<[],f32>
%17998 = "util.global.load"() <{global = @"__auto.blk.20.attn_k.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
%17999 = "torch_c.from_builtin_tensor"(%17998) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%18000 = "util.global.load"() <{global = @"__auto.blk.20.attn_k.q_output:rscale"}> : () -> tensor<f32>
%18001 = "torch_c.from_builtin_tensor"(%18000) : (tensor<f32>) -> !torch.vtensor<[],f32>
%18002 = "util.global.load"() <{global = @"__auto.blk.20.attn_v.q_input:rscale"}> : () -> tensor<f32>
%18003 = "torch_c.from_builtin_tensor"(%18002) : (tensor<f32>) -> !torch.vtensor<[],f32>
%18004 = "util.global.load"() <{global = @"__auto.blk.20.attn_v.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
%18005 = "torch_c.from_builtin_tensor"(%18004) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%18006 = "util.global.load"() <{global = @"__auto.blk.20.attn_v.q_output:rscale"}> : () -> tensor<f32>
%18007 = "torch_c.from_builtin_tensor"(%18006) : (tensor<f32>) -> !torch.vtensor<[],f32>
%18008 = "util.global.load"() <{global = @__auto.blk.20.attn_scale}> : () -> tensor<f32>
%18009 = "torch_c.from_builtin_tensor"(%18008) : (tensor<f32>) -> !torch.vtensor<[],f32>
%18010 = "util.global.load"() <{global = @"__auto.blk.20.attn_output.q_input:rscale"}> : () -> tensor<f32>
%18011 = "torch_c.from_builtin_tensor"(%18010) : (tensor<f32>) -> !torch.vtensor<[],f32>
%18012 = "util.global.load"() <{global = @"__auto.blk.20.attn_output.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
%18013 = "torch_c.from_builtin_tensor"(%18012) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%18014 = "util.global.load"() <{global = @__auto.blk.20.ffn_norm.weight}> : () -> tensor<4096xbf16>
%18015 = "torch_c.from_builtin_tensor"(%18014) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
%18016 = "util.global.load"() <{global = @"__auto.blk.20.ffn_gate.q_input:rscale"}> : () -> tensor<f32>
%18017 = "torch_c.from_builtin_tensor"(%18016) : (tensor<f32>) -> !torch.vtensor<[],f32>
%18018 = "util.global.load"() <{global = @"__auto.blk.20.ffn_gate.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
%18019 = "torch_c.from_builtin_tensor"(%18018) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%18020 = "util.global.load"() <{global = @"__auto.blk.20.ffn_up.q_input:rscale"}> : () -> tensor<f32>
%18021 = "torch_c.from_builtin_tensor"(%18020) : (tensor<f32>) -> !torch.vtensor<[],f32>
%18022 = "util.global.load"() <{global = @"__auto.blk.20.ffn_up.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
%18023 = "torch_c.from_builtin_tensor"(%18022) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%18024 = "util.global.load"() <{global = @"__auto.blk.20.ffn_down.q_input:rscale"}> : () -> tensor<f32>
%18025 = "torch_c.from_builtin_tensor"(%18024) : (tensor<f32>) -> !torch.vtensor<[],f32>
%18026 = "util.global.load"() <{global = @"__auto.blk.20.ffn_down.weight:qs"}> : () -> tensor<4096x14336xf8E4M3FNUZ>
%18027 = "torch_c.from_builtin_tensor"(%18026) : (tensor<4096x14336xf8E4M3FNUZ>) -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
%18028 = "util.global.load"() <{global = @__auto.blk.21.attn_norm.weight}> : () -> tensor<4096xbf16>
%18029 = "torch_c.from_builtin_tensor"(%18028) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
%18030 = "util.global.load"() <{global = @"__auto.blk.21.attn_q.q_input:rscale"}> : () -> tensor<f32>
%18031 = "torch_c.from_builtin_tensor"(%18030) : (tensor<f32>) -> !torch.vtensor<[],f32>
%18032 = "util.global.load"() <{global = @"__auto.blk.21.attn_q.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
%18033 = "torch_c.from_builtin_tensor"(%18032) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%18034 = "util.global.load"() <{global = @"__auto.blk.21.attn_q.q_output:rscale"}> : () -> tensor<f32>
%18035 = "torch_c.from_builtin_tensor"(%18034) : (tensor<f32>) -> !torch.vtensor<[],f32>
%18036 = "util.global.load"() <{global = @"__auto.blk.21.attn_k.q_input:rscale"}> : () -> tensor<f32>
%18037 = "torch_c.from_builtin_tensor"(%18036) : (tensor<f32>) -> !torch.vtensor<[],f32>
%18038 = "util.global.load"() <{global = @"__auto.blk.21.attn_k.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
%18039 = "torch_c.from_builtin_tensor"(%18038) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%18040 = "util.global.load"() <{global = @"__auto.blk.21.attn_k.q_output:rscale"}> : () -> tensor<f32>
%18041 = "torch_c.from_builtin_tensor"(%18040) : (tensor<f32>) -> !torch.vtensor<[],f32>
%18042 = "util.global.load"() <{global = @"__auto.blk.21.attn_v.q_input:rscale"}> : () -> tensor<f32>
%18043 = "torch_c.from_builtin_tensor"(%18042) : (tensor<f32>) -> !torch.vtensor<[],f32>
%18044 = "util.global.load"() <{global = @"__auto.blk.21.attn_v.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
%18045 = "torch_c.from_builtin_tensor"(%18044) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%18046 = "util.global.load"() <{global = @"__auto.blk.21.attn_v.q_output:rscale"}> : () -> tensor<f32>
%18047 = "torch_c.from_builtin_tensor"(%18046) : (tensor<f32>) -> !torch.vtensor<[],f32>
%18048 = "util.global.load"() <{global = @__auto.blk.21.attn_scale}> : () -> tensor<f32>
%18049 = "torch_c.from_builtin_tensor"(%18048) : (tensor<f32>) -> !torch.vtensor<[],f32>
%18050 = "util.global.load"() <{global = @"__auto.blk.21.attn_output.q_input:rscale"}> : () -> tensor<f32>
%18051 = "torch_c.from_builtin_tensor"(%18050) : (tensor<f32>) -> !torch.vtensor<[],f32>
%18052 = "util.global.load"() <{global = @"__auto.blk.21.attn_output.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
%18053 = "torch_c.from_builtin_tensor"(%18052) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%18054 = "util.global.load"() <{global = @__auto.blk.21.ffn_norm.weight}> : () -> tensor<4096xbf16>
%18055 = "torch_c.from_builtin_tensor"(%18054) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
%18056 = "util.global.load"() <{global = @"__auto.blk.21.ffn_gate.q_input:rscale"}> : () -> tensor<f32>
%18057 = "torch_c.from_builtin_tensor"(%18056) : (tensor<f32>) -> !torch.vtensor<[],f32>
%18058 = "util.global.load"() <{global = @"__auto.blk.21.ffn_gate.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
%18059 = "torch_c.from_builtin_tensor"(%18058) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%18060 = "util.global.load"() <{global = @"__auto.blk.21.ffn_up.q_input:rscale"}> : () -> tensor<f32>
%18061 = "torch_c.from_builtin_tensor"(%18060) : (tensor<f32>) -> !torch.vtensor<[],f32>
%18062 = "util.global.load"() <{global = @"__auto.blk.21.ffn_up.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
%18063 = "torch_c.from_builtin_tensor"(%18062) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%18064 = "util.global.load"() <{global = @"__auto.blk.21.ffn_down.q_input:rscale"}> : () -> tensor<f32>
%18065 = "torch_c.from_builtin_tensor"(%18064) : (tensor<f32>) -> !torch.vtensor<[],f32>
%18066 = "util.global.load"() <{global = @"__auto.blk.21.ffn_down.weight:qs"}> : () -> tensor<4096x14336xf8E4M3FNUZ>
%18067 = "torch_c.from_builtin_tensor"(%18066) : (tensor<4096x14336xf8E4M3FNUZ>) -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
%18068 = "util.global.load"() <{global = @__auto.blk.22.attn_norm.weight}> : () -> tensor<4096xbf16>
%18069 = "torch_c.from_builtin_tensor"(%18068) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
%18070 = "util.global.load"() <{global = @"__auto.blk.22.attn_q.q_input:rscale"}> : () -> tensor<f32>
%18071 = "torch_c.from_builtin_tensor"(%18070) : (tensor<f32>) -> !torch.vtensor<[],f32>
%18072 = "util.global.load"() <{global = @"__auto.blk.22.attn_q.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
%18073 = "torch_c.from_builtin_tensor"(%18072) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%18074 = "util.global.load"() <{global = @"__auto.blk.22.attn_q.q_output:rscale"}> : () -> tensor<f32>
%18075 = "torch_c.from_builtin_tensor"(%18074) : (tensor<f32>) -> !torch.vtensor<[],f32>
%18076 = "util.global.load"() <{global = @"__auto.blk.22.attn_k.q_input:rscale"}> : () -> tensor<f32>
%18077 = "torch_c.from_builtin_tensor"(%18076) : (tensor<f32>) -> !torch.vtensor<[],f32>
%18078 = "util.global.load"() <{global = @"__auto.blk.22.attn_k.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
%18079 = "torch_c.from_builtin_tensor"(%18078) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%18080 = "util.global.load"() <{global = @"__auto.blk.22.attn_k.q_output:rscale"}> : () -> tensor<f32>
%18081 = "torch_c.from_builtin_tensor"(%18080) : (tensor<f32>) -> !torch.vtensor<[],f32>
%18082 = "util.global.load"() <{global = @"__auto.blk.22.attn_v.q_input:rscale"}> : () -> tensor<f32>
%18083 = "torch_c.from_builtin_tensor"(%18082) : (tensor<f32>) -> !torch.vtensor<[],f32>
%18084 = "util.global.load"() <{global = @"__auto.blk.22.attn_v.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
%18085 = "torch_c.from_builtin_tensor"(%18084) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%18086 = "util.global.load"() <{global = @"__auto.blk.22.attn_v.q_output:rscale"}> : () -> tensor<f32>
%18087 = "torch_c.from_builtin_tensor"(%18086) : (tensor<f32>) -> !torch.vtensor<[],f32>
%18088 = "util.global.load"() <{global = @__auto.blk.22.attn_scale}> : () -> tensor<f32>
%18089 = "torch_c.from_builtin_tensor"(%18088) : (tensor<f32>) -> !torch.vtensor<[],f32>
%18090 = "util.global.load"() <{global = @"__auto.blk.22.attn_output.q_input:rscale"}> : () -> tensor<f32>
%18091 = "torch_c.from_builtin_tensor"(%18090) : (tensor<f32>) -> !torch.vtensor<[],f32>
%18092 = "util.global.load"() <{global = @"__auto.blk.22.attn_output.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
%18093 = "torch_c.from_builtin_tensor"(%18092) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%18094 = "util.global.load"() <{global = @__auto.blk.22.ffn_norm.weight}> : () -> tensor<4096xbf16>
%18095 = "torch_c.from_builtin_tensor"(%18094) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
%18096 = "util.global.load"() <{global = @"__auto.blk.22.ffn_gate.q_input:rscale"}> : () -> tensor<f32>
%18097 = "torch_c.from_builtin_tensor"(%18096) : (tensor<f32>) -> !torch.vtensor<[],f32>
%18098 = "util.global.load"() <{global = @"__auto.blk.22.ffn_gate.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
%18099 = "torch_c.from_builtin_tensor"(%18098) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%18100 = "util.global.load"() <{global = @"__auto.blk.22.ffn_up.q_input:rscale"}> : () -> tensor<f32>
%18101 = "torch_c.from_builtin_tensor"(%18100) : (tensor<f32>) -> !torch.vtensor<[],f32>
%18102 = "util.global.load"() <{global = @"__auto.blk.22.ffn_up.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
%18103 = "torch_c.from_builtin_tensor"(%18102) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%18104 = "util.global.load"() <{global = @"__auto.blk.22.ffn_down.q_input:rscale"}> : () -> tensor<f32>
%18105 = "torch_c.from_builtin_tensor"(%18104) : (tensor<f32>) -> !torch.vtensor<[],f32>
%18106 = "util.global.load"() <{global = @"__auto.blk.22.ffn_down.weight:qs"}> : () -> tensor<4096x14336xf8E4M3FNUZ>
%18107 = "torch_c.from_builtin_tensor"(%18106) : (tensor<4096x14336xf8E4M3FNUZ>) -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
%18108 = "util.global.load"() <{global = @__auto.blk.23.attn_norm.weight}> : () -> tensor<4096xbf16>
%18109 = "torch_c.from_builtin_tensor"(%18108) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
%18110 = "util.global.load"() <{global = @"__auto.blk.23.attn_q.q_input:rscale"}> : () -> tensor<f32>
%18111 = "torch_c.from_builtin_tensor"(%18110) : (tensor<f32>) -> !torch.vtensor<[],f32>
%18112 = "util.global.load"() <{global = @"__auto.blk.23.attn_q.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
%18113 = "torch_c.from_builtin_tensor"(%18112) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%18114 = "util.global.load"() <{global = @"__auto.blk.23.attn_q.q_output:rscale"}> : () -> tensor<f32>
%18115 = "torch_c.from_builtin_tensor"(%18114) : (tensor<f32>) -> !torch.vtensor<[],f32>
%18116 = "util.global.load"() <{global = @"__auto.blk.23.attn_k.q_input:rscale"}> : () -> tensor<f32>
%18117 = "torch_c.from_builtin_tensor"(%18116) : (tensor<f32>) -> !torch.vtensor<[],f32>
%18118 = "util.global.load"() <{global = @"__auto.blk.23.attn_k.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
%18119 = "torch_c.from_builtin_tensor"(%18118) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%18120 = "util.global.load"() <{global = @"__auto.blk.23.attn_k.q_output:rscale"}> : () -> tensor<f32>
%18121 = "torch_c.from_builtin_tensor"(%18120) : (tensor<f32>) -> !torch.vtensor<[],f32>
%18122 = "util.global.load"() <{global = @"__auto.blk.23.attn_v.q_input:rscale"}> : () -> tensor<f32>
%18123 = "torch_c.from_builtin_tensor"(%18122) : (tensor<f32>) -> !torch.vtensor<[],f32>
%18124 = "util.global.load"() <{global = @"__auto.blk.23.attn_v.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
%18125 = "torch_c.from_builtin_tensor"(%18124) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%18126 = "util.global.load"() <{global = @"__auto.blk.23.attn_v.q_output:rscale"}> : () -> tensor<f32>
%18127 = "torch_c.from_builtin_tensor"(%18126) : (tensor<f32>) -> !torch.vtensor<[],f32>
%18128 = "util.global.load"() <{global = @__auto.blk.23.attn_scale}> : () -> tensor<f32>
%18129 = "torch_c.from_builtin_tensor"(%18128) : (tensor<f32>) -> !torch.vtensor<[],f32>
%18130 = "util.global.load"() <{global = @"__auto.blk.23.attn_output.q_input:rscale"}> : () -> tensor<f32>
%18131 = "torch_c.from_builtin_tensor"(%18130) : (tensor<f32>) -> !torch.vtensor<[],f32>
%18132 = "util.global.load"() <{global = @"__auto.blk.23.attn_output.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
%18133 = "torch_c.from_builtin_tensor"(%18132) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%18134 = "util.global.load"() <{global = @__auto.blk.23.ffn_norm.weight}> : () -> tensor<4096xbf16>
%18135 = "torch_c.from_builtin_tensor"(%18134) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
%18136 = "util.global.load"() <{global = @"__auto.blk.23.ffn_gate.q_input:rscale"}> : () -> tensor<f32>
%18137 = "torch_c.from_builtin_tensor"(%18136) : (tensor<f32>) -> !torch.vtensor<[],f32>
%18138 = "util.global.load"() <{global = @"__auto.blk.23.ffn_gate.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
%18139 = "torch_c.from_builtin_tensor"(%18138) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%18140 = "util.global.load"() <{global = @"__auto.blk.23.ffn_up.q_input:rscale"}> : () -> tensor<f32>
%18141 = "torch_c.from_builtin_tensor"(%18140) : (tensor<f32>) -> !torch.vtensor<[],f32>
%18142 = "util.global.load"() <{global = @"__auto.blk.23.ffn_up.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
%18143 = "torch_c.from_builtin_tensor"(%18142) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%18144 = "util.global.load"() <{global = @"__auto.blk.23.ffn_down.q_input:rscale"}> : () -> tensor<f32>
%18145 = "torch_c.from_builtin_tensor"(%18144) : (tensor<f32>) -> !torch.vtensor<[],f32>
%18146 = "util.global.load"() <{global = @"__auto.blk.23.ffn_down.weight:qs"}> : () -> tensor<4096x14336xf8E4M3FNUZ>
%18147 = "torch_c.from_builtin_tensor"(%18146) : (tensor<4096x14336xf8E4M3FNUZ>) -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
%18148 = "util.global.load"() <{global = @__auto.blk.24.attn_norm.weight}> : () -> tensor<4096xbf16>
%18149 = "torch_c.from_builtin_tensor"(%18148) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
%18150 = "util.global.load"() <{global = @"__auto.blk.24.attn_q.q_input:rscale"}> : () -> tensor<f32>
%18151 = "torch_c.from_builtin_tensor"(%18150) : (tensor<f32>) -> !torch.vtensor<[],f32>
%18152 = "util.global.load"() <{global = @"__auto.blk.24.attn_q.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
%18153 = "torch_c.from_builtin_tensor"(%18152) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%18154 = "util.global.load"() <{global = @"__auto.blk.24.attn_q.q_output:rscale"}> : () -> tensor<f32>
%18155 = "torch_c.from_builtin_tensor"(%18154) : (tensor<f32>) -> !torch.vtensor<[],f32>
%18156 = "util.global.load"() <{global = @"__auto.blk.24.attn_k.q_input:rscale"}> : () -> tensor<f32>
%18157 = "torch_c.from_builtin_tensor"(%18156) : (tensor<f32>) -> !torch.vtensor<[],f32>
%18158 = "util.global.load"() <{global = @"__auto.blk.24.attn_k.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
%18159 = "torch_c.from_builtin_tensor"(%18158) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%18160 = "util.global.load"() <{global = @"__auto.blk.24.attn_k.q_output:rscale"}> : () -> tensor<f32>
%18161 = "torch_c.from_builtin_tensor"(%18160) : (tensor<f32>) -> !torch.vtensor<[],f32>
%18162 = "util.global.load"() <{global = @"__auto.blk.24.attn_v.q_input:rscale"}> : () -> tensor<f32>
%18163 = "torch_c.from_builtin_tensor"(%18162) : (tensor<f32>) -> !torch.vtensor<[],f32>
%18164 = "util.global.load"() <{global = @"__auto.blk.24.attn_v.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
%18165 = "torch_c.from_builtin_tensor"(%18164) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%18166 = "util.global.load"() <{global = @"__auto.blk.24.attn_v.q_output:rscale"}> : () -> tensor<f32>
%18167 = "torch_c.from_builtin_tensor"(%18166) : (tensor<f32>) -> !torch.vtensor<[],f32>
%18168 = "util.global.load"() <{global = @__auto.blk.24.attn_scale}> : () -> tensor<f32>
%18169 = "torch_c.from_builtin_tensor"(%18168) : (tensor<f32>) -> !torch.vtensor<[],f32>
%18170 = "util.global.load"() <{global = @"__auto.blk.24.attn_output.q_input:rscale"}> : () -> tensor<f32>
%18171 = "torch_c.from_builtin_tensor"(%18170) : (tensor<f32>) -> !torch.vtensor<[],f32>
%18172 = "util.global.load"() <{global = @"__auto.blk.24.attn_output.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
%18173 = "torch_c.from_builtin_tensor"(%18172) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%18174 = "util.global.load"() <{global = @__auto.blk.24.ffn_norm.weight}> : () -> tensor<4096xbf16>
%18175 = "torch_c.from_builtin_tensor"(%18174) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
%18176 = "util.global.load"() <{global = @"__auto.blk.24.ffn_gate.q_input:rscale"}> : () -> tensor<f32>
%18177 = "torch_c.from_builtin_tensor"(%18176) : (tensor<f32>) -> !torch.vtensor<[],f32>
%18178 = "util.global.load"() <{global = @"__auto.blk.24.ffn_gate.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
%18179 = "torch_c.from_builtin_tensor"(%18178) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%18180 = "util.global.load"() <{global = @"__auto.blk.24.ffn_up.q_input:rscale"}> : () -> tensor<f32>
%18181 = "torch_c.from_builtin_tensor"(%18180) : (tensor<f32>) -> !torch.vtensor<[],f32>
%18182 = "util.global.load"() <{global = @"__auto.blk.24.ffn_up.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
%18183 = "torch_c.from_builtin_tensor"(%18182) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%18184 = "util.global.load"() <{global = @"__auto.blk.24.ffn_down.q_input:rscale"}> : () -> tensor<f32>
%18185 = "torch_c.from_builtin_tensor"(%18184) : (tensor<f32>) -> !torch.vtensor<[],f32>
%18186 = "util.global.load"() <{global = @"__auto.blk.24.ffn_down.weight:qs"}> : () -> tensor<4096x14336xf8E4M3FNUZ>
%18187 = "torch_c.from_builtin_tensor"(%18186) : (tensor<4096x14336xf8E4M3FNUZ>) -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
%18188 = "util.global.load"() <{global = @__auto.blk.25.attn_norm.weight}> : () -> tensor<4096xbf16>
%18189 = "torch_c.from_builtin_tensor"(%18188) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
%18190 = "util.global.load"() <{global = @"__auto.blk.25.attn_q.q_input:rscale"}> : () -> tensor<f32>
%18191 = "torch_c.from_builtin_tensor"(%18190) : (tensor<f32>) -> !torch.vtensor<[],f32>
%18192 = "util.global.load"() <{global = @"__auto.blk.25.attn_q.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
%18193 = "torch_c.from_builtin_tensor"(%18192) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%18194 = "util.global.load"() <{global = @"__auto.blk.25.attn_q.q_output:rscale"}> : () -> tensor<f32>
%18195 = "torch_c.from_builtin_tensor"(%18194) : (tensor<f32>) -> !torch.vtensor<[],f32>
%18196 = "util.global.load"() <{global = @"__auto.blk.25.attn_k.q_input:rscale"}> : () -> tensor<f32>
%18197 = "torch_c.from_builtin_tensor"(%18196) : (tensor<f32>) -> !torch.vtensor<[],f32>
%18198 = "util.global.load"() <{global = @"__auto.blk.25.attn_k.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
%18199 = "torch_c.from_builtin_tensor"(%18198) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%18200 = "util.global.load"() <{global = @"__auto.blk.25.attn_k.q_output:rscale"}> : () -> tensor<f32>
%18201 = "torch_c.from_builtin_tensor"(%18200) : (tensor<f32>) -> !torch.vtensor<[],f32>
%18202 = "util.global.load"() <{global = @"__auto.blk.25.attn_v.q_input:rscale"}> : () -> tensor<f32>
%18203 = "torch_c.from_builtin_tensor"(%18202) : (tensor<f32>) -> !torch.vtensor<[],f32>
%18204 = "util.global.load"() <{global = @"__auto.blk.25.attn_v.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
%18205 = "torch_c.from_builtin_tensor"(%18204) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%18206 = "util.global.load"() <{global = @"__auto.blk.25.attn_v.q_output:rscale"}> : () -> tensor<f32>
%18207 = "torch_c.from_builtin_tensor"(%18206) : (tensor<f32>) -> !torch.vtensor<[],f32>
%18208 = "util.global.load"() <{global = @__auto.blk.25.attn_scale}> : () -> tensor<f32>
%18209 = "torch_c.from_builtin_tensor"(%18208) : (tensor<f32>) -> !torch.vtensor<[],f32>
%18210 = "util.global.load"() <{global = @"__auto.blk.25.attn_output.q_input:rscale"}> : () -> tensor<f32>
%18211 = "torch_c.from_builtin_tensor"(%18210) : (tensor<f32>) -> !torch.vtensor<[],f32>
%18212 = "util.global.load"() <{global = @"__auto.blk.25.attn_output.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
%18213 = "torch_c.from_builtin_tensor"(%18212) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%18214 = "util.global.load"() <{global = @__auto.blk.25.ffn_norm.weight}> : () -> tensor<4096xbf16>
%18215 = "torch_c.from_builtin_tensor"(%18214) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
%18216 = "util.global.load"() <{global = @"__auto.blk.25.ffn_gate.q_input:rscale"}> : () -> tensor<f32>
%18217 = "torch_c.from_builtin_tensor"(%18216) : (tensor<f32>) -> !torch.vtensor<[],f32>
%18218 = "util.global.load"() <{global = @"__auto.blk.25.ffn_gate.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
%18219 = "torch_c.from_builtin_tensor"(%18218) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%18220 = "util.global.load"() <{global = @"__auto.blk.25.ffn_up.q_input:rscale"}> : () -> tensor<f32>
%18221 = "torch_c.from_builtin_tensor"(%18220) : (tensor<f32>) -> !torch.vtensor<[],f32>
%18222 = "util.global.load"() <{global = @"__auto.blk.25.ffn_up.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
%18223 = "torch_c.from_builtin_tensor"(%18222) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%18224 = "util.global.load"() <{global = @"__auto.blk.25.ffn_down.q_input:rscale"}> : () -> tensor<f32>
%18225 = "torch_c.from_builtin_tensor"(%18224) : (tensor<f32>) -> !torch.vtensor<[],f32>
%18226 = "util.global.load"() <{global = @"__auto.blk.25.ffn_down.weight:qs"}> : () -> tensor<4096x14336xf8E4M3FNUZ>
%18227 = "torch_c.from_builtin_tensor"(%18226) : (tensor<4096x14336xf8E4M3FNUZ>) -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
%18228 = "util.global.load"() <{global = @__auto.blk.26.attn_norm.weight}> : () -> tensor<4096xbf16>
%18229 = "torch_c.from_builtin_tensor"(%18228) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
%18230 = "util.global.load"() <{global = @"__auto.blk.26.attn_q.q_input:rscale"}> : () -> tensor<f32>
%18231 = "torch_c.from_builtin_tensor"(%18230) : (tensor<f32>) -> !torch.vtensor<[],f32>
%18232 = "util.global.load"() <{global = @"__auto.blk.26.attn_q.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
%18233 = "torch_c.from_builtin_tensor"(%18232) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%18234 = "util.global.load"() <{global = @"__auto.blk.26.attn_q.q_output:rscale"}> : () -> tensor<f32>
%18235 = "torch_c.from_builtin_tensor"(%18234) : (tensor<f32>) -> !torch.vtensor<[],f32>
%18236 = "util.global.load"() <{global = @"__auto.blk.26.attn_k.q_input:rscale"}> : () -> tensor<f32>
%18237 = "torch_c.from_builtin_tensor"(%18236) : (tensor<f32>) -> !torch.vtensor<[],f32>
%18238 = "util.global.load"() <{global = @"__auto.blk.26.attn_k.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
%18239 = "torch_c.from_builtin_tensor"(%18238) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%18240 = "util.global.load"() <{global = @"__auto.blk.26.attn_k.q_output:rscale"}> : () -> tensor<f32>
%18241 = "torch_c.from_builtin_tensor"(%18240) : (tensor<f32>) -> !torch.vtensor<[],f32>
%18242 = "util.global.load"() <{global = @"__auto.blk.26.attn_v.q_input:rscale"}> : () -> tensor<f32>
%18243 = "torch_c.from_builtin_tensor"(%18242) : (tensor<f32>) -> !torch.vtensor<[],f32>
%18244 = "util.global.load"() <{global = @"__auto.blk.26.attn_v.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
%18245 = "torch_c.from_builtin_tensor"(%18244) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%18246 = "util.global.load"() <{global = @"__auto.blk.26.attn_v.q_output:rscale"}> : () -> tensor<f32>
%18247 = "torch_c.from_builtin_tensor"(%18246) : (tensor<f32>) -> !torch.vtensor<[],f32>
%18248 = "util.global.load"() <{global = @__auto.blk.26.attn_scale}> : () -> tensor<f32>
%18249 = "torch_c.from_builtin_tensor"(%18248) : (tensor<f32>) -> !torch.vtensor<[],f32>
%18250 = "util.global.load"() <{global = @"__auto.blk.26.attn_output.q_input:rscale"}> : () -> tensor<f32>
%18251 = "torch_c.from_builtin_tensor"(%18250) : (tensor<f32>) -> !torch.vtensor<[],f32>
%18252 = "util.global.load"() <{global = @"__auto.blk.26.attn_output.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
%18253 = "torch_c.from_builtin_tensor"(%18252) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%18254 = "util.global.load"() <{global = @__auto.blk.26.ffn_norm.weight}> : () -> tensor<4096xbf16>
%18255 = "torch_c.from_builtin_tensor"(%18254) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
%18256 = "util.global.load"() <{global = @"__auto.blk.26.ffn_gate.q_input:rscale"}> : () -> tensor<f32>
%18257 = "torch_c.from_builtin_tensor"(%18256) : (tensor<f32>) -> !torch.vtensor<[],f32>
%18258 = "util.global.load"() <{global = @"__auto.blk.26.ffn_gate.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
%18259 = "torch_c.from_builtin_tensor"(%18258) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%18260 = "util.global.load"() <{global = @"__auto.blk.26.ffn_up.q_input:rscale"}> : () -> tensor<f32>
%18261 = "torch_c.from_builtin_tensor"(%18260) : (tensor<f32>) -> !torch.vtensor<[],f32>
%18262 = "util.global.load"() <{global = @"__auto.blk.26.ffn_up.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
%18263 = "torch_c.from_builtin_tensor"(%18262) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%18264 = "util.global.load"() <{global = @"__auto.blk.26.ffn_down.q_input:rscale"}> : () -> tensor<f32>
%18265 = "torch_c.from_builtin_tensor"(%18264) : (tensor<f32>) -> !torch.vtensor<[],f32>
%18266 = "util.global.load"() <{global = @"__auto.blk.26.ffn_down.weight:qs"}> : () -> tensor<4096x14336xf8E4M3FNUZ>
%18267 = "torch_c.from_builtin_tensor"(%18266) : (tensor<4096x14336xf8E4M3FNUZ>) -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
%18268 = "util.global.load"() <{global = @__auto.blk.27.attn_norm.weight}> : () -> tensor<4096xbf16>
%18269 = "torch_c.from_builtin_tensor"(%18268) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
%18270 = "util.global.load"() <{global = @"__auto.blk.27.attn_q.q_input:rscale"}> : () -> tensor<f32>
%18271 = "torch_c.from_builtin_tensor"(%18270) : (tensor<f32>) -> !torch.vtensor<[],f32>
%18272 = "util.global.load"() <{global = @"__auto.blk.27.attn_q.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
%18273 = "torch_c.from_builtin_tensor"(%18272) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%18274 = "util.global.load"() <{global = @"__auto.blk.27.attn_q.q_output:rscale"}> : () -> tensor<f32>
%18275 = "torch_c.from_builtin_tensor"(%18274) : (tensor<f32>) -> !torch.vtensor<[],f32>
%18276 = "util.global.load"() <{global = @"__auto.blk.27.attn_k.q_input:rscale"}> : () -> tensor<f32>
%18277 = "torch_c.from_builtin_tensor"(%18276) : (tensor<f32>) -> !torch.vtensor<[],f32>
%18278 = "util.global.load"() <{global = @"__auto.blk.27.attn_k.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
%18279 = "torch_c.from_builtin_tensor"(%18278) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%18280 = "util.global.load"() <{global = @"__auto.blk.27.attn_k.q_output:rscale"}> : () -> tensor<f32>
%18281 = "torch_c.from_builtin_tensor"(%18280) : (tensor<f32>) -> !torch.vtensor<[],f32>
%18282 = "util.global.load"() <{global = @"__auto.blk.27.attn_v.q_input:rscale"}> : () -> tensor<f32>
%18283 = "torch_c.from_builtin_tensor"(%18282) : (tensor<f32>) -> !torch.vtensor<[],f32>
%18284 = "util.global.load"() <{global = @"__auto.blk.27.attn_v.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
%18285 = "torch_c.from_builtin_tensor"(%18284) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%18286 = "util.global.load"() <{global = @"__auto.blk.27.attn_v.q_output:rscale"}> : () -> tensor<f32>
%18287 = "torch_c.from_builtin_tensor"(%18286) : (tensor<f32>) -> !torch.vtensor<[],f32>
%18288 = "util.global.load"() <{global = @__auto.blk.27.attn_scale}> : () -> tensor<f32>
%18289 = "torch_c.from_builtin_tensor"(%18288) : (tensor<f32>) -> !torch.vtensor<[],f32>
%18290 = "util.global.load"() <{global = @"__auto.blk.27.attn_output.q_input:rscale"}> : () -> tensor<f32>
%18291 = "torch_c.from_builtin_tensor"(%18290) : (tensor<f32>) -> !torch.vtensor<[],f32>
%18292 = "util.global.load"() <{global = @"__auto.blk.27.attn_output.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
%18293 = "torch_c.from_builtin_tensor"(%18292) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%18294 = "util.global.load"() <{global = @__auto.blk.27.ffn_norm.weight}> : () -> tensor<4096xbf16>
%18295 = "torch_c.from_builtin_tensor"(%18294) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
%18296 = "util.global.load"() <{global = @"__auto.blk.27.ffn_gate.q_input:rscale"}> : () -> tensor<f32>
%18297 = "torch_c.from_builtin_tensor"(%18296) : (tensor<f32>) -> !torch.vtensor<[],f32>
%18298 = "util.global.load"() <{global = @"__auto.blk.27.ffn_gate.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
%18299 = "torch_c.from_builtin_tensor"(%18298) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%18300 = "util.global.load"() <{global = @"__auto.blk.27.ffn_up.q_input:rscale"}> : () -> tensor<f32>
%18301 = "torch_c.from_builtin_tensor"(%18300) : (tensor<f32>) -> !torch.vtensor<[],f32>
%18302 = "util.global.load"() <{global = @"__auto.blk.27.ffn_up.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
%18303 = "torch_c.from_builtin_tensor"(%18302) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%18304 = "util.global.load"() <{global = @"__auto.blk.27.ffn_down.q_input:rscale"}> : () -> tensor<f32>
%18305 = "torch_c.from_builtin_tensor"(%18304) : (tensor<f32>) -> !torch.vtensor<[],f32>
%18306 = "util.global.load"() <{global = @"__auto.blk.27.ffn_down.weight:qs"}> : () -> tensor<4096x14336xf8E4M3FNUZ>
%18307 = "torch_c.from_builtin_tensor"(%18306) : (tensor<4096x14336xf8E4M3FNUZ>) -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
%18308 = "util.global.load"() <{global = @__auto.blk.28.attn_norm.weight}> : () -> tensor<4096xbf16>
%18309 = "torch_c.from_builtin_tensor"(%18308) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
%18310 = "util.global.load"() <{global = @"__auto.blk.28.attn_q.q_input:rscale"}> : () -> tensor<f32>
%18311 = "torch_c.from_builtin_tensor"(%18310) : (tensor<f32>) -> !torch.vtensor<[],f32>
%18312 = "util.global.load"() <{global = @"__auto.blk.28.attn_q.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
%18313 = "torch_c.from_builtin_tensor"(%18312) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%18314 = "util.global.load"() <{global = @"__auto.blk.28.attn_q.q_output:rscale"}> : () -> tensor<f32>
%18315 = "torch_c.from_builtin_tensor"(%18314) : (tensor<f32>) -> !torch.vtensor<[],f32>
%18316 = "util.global.load"() <{global = @"__auto.blk.28.attn_k.q_input:rscale"}> : () -> tensor<f32>
%18317 = "torch_c.from_builtin_tensor"(%18316) : (tensor<f32>) -> !torch.vtensor<[],f32>
%18318 = "util.global.load"() <{global = @"__auto.blk.28.attn_k.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
%18319 = "torch_c.from_builtin_tensor"(%18318) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%18320 = "util.global.load"() <{global = @"__auto.blk.28.attn_k.q_output:rscale"}> : () -> tensor<f32>
%18321 = "torch_c.from_builtin_tensor"(%18320) : (tensor<f32>) -> !torch.vtensor<[],f32>
%18322 = "util.global.load"() <{global = @"__auto.blk.28.attn_v.q_input:rscale"}> : () -> tensor<f32>
%18323 = "torch_c.from_builtin_tensor"(%18322) : (tensor<f32>) -> !torch.vtensor<[],f32>
%18324 = "util.global.load"() <{global = @"__auto.blk.28.attn_v.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
%18325 = "torch_c.from_builtin_tensor"(%18324) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%18326 = "util.global.load"() <{global = @"__auto.blk.28.attn_v.q_output:rscale"}> : () -> tensor<f32>
%18327 = "torch_c.from_builtin_tensor"(%18326) : (tensor<f32>) -> !torch.vtensor<[],f32>
%18328 = "util.global.load"() <{global = @__auto.blk.28.attn_scale}> : () -> tensor<f32>
%18329 = "torch_c.from_builtin_tensor"(%18328) : (tensor<f32>) -> !torch.vtensor<[],f32>
%18330 = "util.global.load"() <{global = @"__auto.blk.28.attn_output.q_input:rscale"}> : () -> tensor<f32>
%18331 = "torch_c.from_builtin_tensor"(%18330) : (tensor<f32>) -> !torch.vtensor<[],f32>
%18332 = "util.global.load"() <{global = @"__auto.blk.28.attn_output.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
%18333 = "torch_c.from_builtin_tensor"(%18332) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%18334 = "util.global.load"() <{global = @__auto.blk.28.ffn_norm.weight}> : () -> tensor<4096xbf16>
%18335 = "torch_c.from_builtin_tensor"(%18334) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
%18336 = "util.global.load"() <{global = @"__auto.blk.28.ffn_gate.q_input:rscale"}> : () -> tensor<f32>
%18337 = "torch_c.from_builtin_tensor"(%18336) : (tensor<f32>) -> !torch.vtensor<[],f32>
%18338 = "util.global.load"() <{global = @"__auto.blk.28.ffn_gate.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
%18339 = "torch_c.from_builtin_tensor"(%18338) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%18340 = "util.global.load"() <{global = @"__auto.blk.28.ffn_up.q_input:rscale"}> : () -> tensor<f32>
%18341 = "torch_c.from_builtin_tensor"(%18340) : (tensor<f32>) -> !torch.vtensor<[],f32>
%18342 = "util.global.load"() <{global = @"__auto.blk.28.ffn_up.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
%18343 = "torch_c.from_builtin_tensor"(%18342) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%18344 = "util.global.load"() <{global = @"__auto.blk.28.ffn_down.q_input:rscale"}> : () -> tensor<f32>
%18345 = "torch_c.from_builtin_tensor"(%18344) : (tensor<f32>) -> !torch.vtensor<[],f32>
%18346 = "util.global.load"() <{global = @"__auto.blk.28.ffn_down.weight:qs"}> : () -> tensor<4096x14336xf8E4M3FNUZ>
%18347 = "torch_c.from_builtin_tensor"(%18346) : (tensor<4096x14336xf8E4M3FNUZ>) -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
%18348 = "util.global.load"() <{global = @__auto.blk.29.attn_norm.weight}> : () -> tensor<4096xbf16>
%18349 = "torch_c.from_builtin_tensor"(%18348) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
%18350 = "util.global.load"() <{global = @"__auto.blk.29.attn_q.q_input:rscale"}> : () -> tensor<f32>
%18351 = "torch_c.from_builtin_tensor"(%18350) : (tensor<f32>) -> !torch.vtensor<[],f32>
%18352 = "util.global.load"() <{global = @"__auto.blk.29.attn_q.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
%18353 = "torch_c.from_builtin_tensor"(%18352) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%18354 = "util.global.load"() <{global = @"__auto.blk.29.attn_q.q_output:rscale"}> : () -> tensor<f32>
%18355 = "torch_c.from_builtin_tensor"(%18354) : (tensor<f32>) -> !torch.vtensor<[],f32>
%18356 = "util.global.load"() <{global = @"__auto.blk.29.attn_k.q_input:rscale"}> : () -> tensor<f32>
%18357 = "torch_c.from_builtin_tensor"(%18356) : (tensor<f32>) -> !torch.vtensor<[],f32>
%18358 = "util.global.load"() <{global = @"__auto.blk.29.attn_k.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
%18359 = "torch_c.from_builtin_tensor"(%18358) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%18360 = "util.global.load"() <{global = @"__auto.blk.29.attn_k.q_output:rscale"}> : () -> tensor<f32>
%18361 = "torch_c.from_builtin_tensor"(%18360) : (tensor<f32>) -> !torch.vtensor<[],f32>
%18362 = "util.global.load"() <{global = @"__auto.blk.29.attn_v.q_input:rscale"}> : () -> tensor<f32>
%18363 = "torch_c.from_builtin_tensor"(%18362) : (tensor<f32>) -> !torch.vtensor<[],f32>
%18364 = "util.global.load"() <{global = @"__auto.blk.29.attn_v.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
%18365 = "torch_c.from_builtin_tensor"(%18364) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%18366 = "util.global.load"() <{global = @"__auto.blk.29.attn_v.q_output:rscale"}> : () -> tensor<f32>
%18367 = "torch_c.from_builtin_tensor"(%18366) : (tensor<f32>) -> !torch.vtensor<[],f32>
%18368 = "util.global.load"() <{global = @__auto.blk.29.attn_scale}> : () -> tensor<f32>
%18369 = "torch_c.from_builtin_tensor"(%18368) : (tensor<f32>) -> !torch.vtensor<[],f32>
%18370 = "util.global.load"() <{global = @"__auto.blk.29.attn_output.q_input:rscale"}> : () -> tensor<f32>
%18371 = "torch_c.from_builtin_tensor"(%18370) : (tensor<f32>) -> !torch.vtensor<[],f32>
%18372 = "util.global.load"() <{global = @"__auto.blk.29.attn_output.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
%18373 = "torch_c.from_builtin_tensor"(%18372) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%18374 = "util.global.load"() <{global = @__auto.blk.29.ffn_norm.weight}> : () -> tensor<4096xbf16>
%18375 = "torch_c.from_builtin_tensor"(%18374) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
%18376 = "util.global.load"() <{global = @"__auto.blk.29.ffn_gate.q_input:rscale"}> : () -> tensor<f32>
%18377 = "torch_c.from_builtin_tensor"(%18376) : (tensor<f32>) -> !torch.vtensor<[],f32>
%18378 = "util.global.load"() <{global = @"__auto.blk.29.ffn_gate.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
%18379 = "torch_c.from_builtin_tensor"(%18378) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%18380 = "util.global.load"() <{global = @"__auto.blk.29.ffn_up.q_input:rscale"}> : () -> tensor<f32>
%18381 = "torch_c.from_builtin_tensor"(%18380) : (tensor<f32>) -> !torch.vtensor<[],f32>
%18382 = "util.global.load"() <{global = @"__auto.blk.29.ffn_up.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
%18383 = "torch_c.from_builtin_tensor"(%18382) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%18384 = "util.global.load"() <{global = @"__auto.blk.29.ffn_down.q_input:rscale"}> : () -> tensor<f32>
%18385 = "torch_c.from_builtin_tensor"(%18384) : (tensor<f32>) -> !torch.vtensor<[],f32>
%18386 = "util.global.load"() <{global = @"__auto.blk.29.ffn_down.weight:qs"}> : () -> tensor<4096x14336xf8E4M3FNUZ>
%18387 = "torch_c.from_builtin_tensor"(%18386) : (tensor<4096x14336xf8E4M3FNUZ>) -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
%18388 = "util.global.load"() <{global = @__auto.blk.30.attn_norm.weight}> : () -> tensor<4096xbf16>
%18389 = "torch_c.from_builtin_tensor"(%18388) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
%18390 = "util.global.load"() <{global = @"__auto.blk.30.attn_q.q_input:rscale"}> : () -> tensor<f32>
%18391 = "torch_c.from_builtin_tensor"(%18390) : (tensor<f32>) -> !torch.vtensor<[],f32>
%18392 = "util.global.load"() <{global = @"__auto.blk.30.attn_q.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
%18393 = "torch_c.from_builtin_tensor"(%18392) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%18394 = "util.global.load"() <{global = @"__auto.blk.30.attn_q.q_output:rscale"}> : () -> tensor<f32>
%18395 = "torch_c.from_builtin_tensor"(%18394) : (tensor<f32>) -> !torch.vtensor<[],f32>
%18396 = "util.global.load"() <{global = @"__auto.blk.30.attn_k.q_input:rscale"}> : () -> tensor<f32>
%18397 = "torch_c.from_builtin_tensor"(%18396) : (tensor<f32>) -> !torch.vtensor<[],f32>
%18398 = "util.global.load"() <{global = @"__auto.blk.30.attn_k.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
%18399 = "torch_c.from_builtin_tensor"(%18398) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%18400 = "util.global.load"() <{global = @"__auto.blk.30.attn_k.q_output:rscale"}> : () -> tensor<f32>
%18401 = "torch_c.from_builtin_tensor"(%18400) : (tensor<f32>) -> !torch.vtensor<[],f32>
%18402 = "util.global.load"() <{global = @"__auto.blk.30.attn_v.q_input:rscale"}> : () -> tensor<f32>
%18403 = "torch_c.from_builtin_tensor"(%18402) : (tensor<f32>) -> !torch.vtensor<[],f32>
%18404 = "util.global.load"() <{global = @"__auto.blk.30.attn_v.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
%18405 = "torch_c.from_builtin_tensor"(%18404) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%18406 = "util.global.load"() <{global = @"__auto.blk.30.attn_v.q_output:rscale"}> : () -> tensor<f32>
%18407 = "torch_c.from_builtin_tensor"(%18406) : (tensor<f32>) -> !torch.vtensor<[],f32>
%18408 = "util.global.load"() <{global = @__auto.blk.30.attn_scale}> : () -> tensor<f32>
%18409 = "torch_c.from_builtin_tensor"(%18408) : (tensor<f32>) -> !torch.vtensor<[],f32>
%18410 = "util.global.load"() <{global = @"__auto.blk.30.attn_output.q_input:rscale"}> : () -> tensor<f32>
%18411 = "torch_c.from_builtin_tensor"(%18410) : (tensor<f32>) -> !torch.vtensor<[],f32>
%18412 = "util.global.load"() <{global = @"__auto.blk.30.attn_output.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
%18413 = "torch_c.from_builtin_tensor"(%18412) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%18414 = "util.global.load"() <{global = @__auto.blk.30.ffn_norm.weight}> : () -> tensor<4096xbf16>
%18415 = "torch_c.from_builtin_tensor"(%18414) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
%18416 = "util.global.load"() <{global = @"__auto.blk.30.ffn_gate.q_input:rscale"}> : () -> tensor<f32>
%18417 = "torch_c.from_builtin_tensor"(%18416) : (tensor<f32>) -> !torch.vtensor<[],f32>
%18418 = "util.global.load"() <{global = @"__auto.blk.30.ffn_gate.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
%18419 = "torch_c.from_builtin_tensor"(%18418) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%18420 = "util.global.load"() <{global = @"__auto.blk.30.ffn_up.q_input:rscale"}> : () -> tensor<f32>
%18421 = "torch_c.from_builtin_tensor"(%18420) : (tensor<f32>) -> !torch.vtensor<[],f32>
%18422 = "util.global.load"() <{global = @"__auto.blk.30.ffn_up.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
%18423 = "torch_c.from_builtin_tensor"(%18422) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%18424 = "util.global.load"() <{global = @"__auto.blk.30.ffn_down.q_input:rscale"}> : () -> tensor<f32>
%18425 = "torch_c.from_builtin_tensor"(%18424) : (tensor<f32>) -> !torch.vtensor<[],f32>
%18426 = "util.global.load"() <{global = @"__auto.blk.30.ffn_down.weight:qs"}> : () -> tensor<4096x14336xf8E4M3FNUZ>
%18427 = "torch_c.from_builtin_tensor"(%18426) : (tensor<4096x14336xf8E4M3FNUZ>) -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
%18428 = "util.global.load"() <{global = @__auto.blk.31.attn_norm.weight}> : () -> tensor<4096xbf16>
%18429 = "torch_c.from_builtin_tensor"(%18428) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
%18430 = "util.global.load"() <{global = @"__auto.blk.31.attn_q.q_input:rscale"}> : () -> tensor<f32>
%18431 = "torch_c.from_builtin_tensor"(%18430) : (tensor<f32>) -> !torch.vtensor<[],f32>
%18432 = "util.global.load"() <{global = @"__auto.blk.31.attn_q.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
%18433 = "torch_c.from_builtin_tensor"(%18432) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%18434 = "util.global.load"() <{global = @"__auto.blk.31.attn_q.q_output:rscale"}> : () -> tensor<f32>
%18435 = "torch_c.from_builtin_tensor"(%18434) : (tensor<f32>) -> !torch.vtensor<[],f32>
%18436 = "util.global.load"() <{global = @"__auto.blk.31.attn_k.q_input:rscale"}> : () -> tensor<f32>
%18437 = "torch_c.from_builtin_tensor"(%18436) : (tensor<f32>) -> !torch.vtensor<[],f32>
%18438 = "util.global.load"() <{global = @"__auto.blk.31.attn_k.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
%18439 = "torch_c.from_builtin_tensor"(%18438) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%18440 = "util.global.load"() <{global = @"__auto.blk.31.attn_k.q_output:rscale"}> : () -> tensor<f32>
%18441 = "torch_c.from_builtin_tensor"(%18440) : (tensor<f32>) -> !torch.vtensor<[],f32>
%18442 = "util.global.load"() <{global = @"__auto.blk.31.attn_v.q_input:rscale"}> : () -> tensor<f32>
%18443 = "torch_c.from_builtin_tensor"(%18442) : (tensor<f32>) -> !torch.vtensor<[],f32>
%18444 = "util.global.load"() <{global = @"__auto.blk.31.attn_v.weight:qs"}> : () -> tensor<1024x4096xf8E4M3FNUZ>
%18445 = "torch_c.from_builtin_tensor"(%18444) : (tensor<1024x4096xf8E4M3FNUZ>) -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%18446 = "util.global.load"() <{global = @"__auto.blk.31.attn_v.q_output:rscale"}> : () -> tensor<f32>
%18447 = "torch_c.from_builtin_tensor"(%18446) : (tensor<f32>) -> !torch.vtensor<[],f32>
%18448 = "util.global.load"() <{global = @__auto.blk.31.attn_scale}> : () -> tensor<f32>
%18449 = "torch_c.from_builtin_tensor"(%18448) : (tensor<f32>) -> !torch.vtensor<[],f32>
%18450 = "util.global.load"() <{global = @"__auto.blk.31.attn_output.q_input:rscale"}> : () -> tensor<f32>
%18451 = "torch_c.from_builtin_tensor"(%18450) : (tensor<f32>) -> !torch.vtensor<[],f32>
%18452 = "util.global.load"() <{global = @"__auto.blk.31.attn_output.weight:qs"}> : () -> tensor<4096x4096xf8E4M3FNUZ>
%18453 = "torch_c.from_builtin_tensor"(%18452) : (tensor<4096x4096xf8E4M3FNUZ>) -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%18454 = "util.global.load"() <{global = @__auto.blk.31.ffn_norm.weight}> : () -> tensor<4096xbf16>
%18455 = "torch_c.from_builtin_tensor"(%18454) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
%18456 = "util.global.load"() <{global = @"__auto.blk.31.ffn_gate.q_input:rscale"}> : () -> tensor<f32>
%18457 = "torch_c.from_builtin_tensor"(%18456) : (tensor<f32>) -> !torch.vtensor<[],f32>
%18458 = "util.global.load"() <{global = @"__auto.blk.31.ffn_gate.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
%18459 = "torch_c.from_builtin_tensor"(%18458) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%18460 = "util.global.load"() <{global = @"__auto.blk.31.ffn_up.q_input:rscale"}> : () -> tensor<f32>
%18461 = "torch_c.from_builtin_tensor"(%18460) : (tensor<f32>) -> !torch.vtensor<[],f32>
%18462 = "util.global.load"() <{global = @"__auto.blk.31.ffn_up.weight:qs"}> : () -> tensor<14336x4096xf8E4M3FNUZ>
%18463 = "torch_c.from_builtin_tensor"(%18462) : (tensor<14336x4096xf8E4M3FNUZ>) -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%18464 = "util.global.load"() <{global = @"__auto.blk.31.ffn_down.q_input:rscale"}> : () -> tensor<f32>
%18465 = "torch_c.from_builtin_tensor"(%18464) : (tensor<f32>) -> !torch.vtensor<[],f32>
%18466 = "util.global.load"() <{global = @"__auto.blk.31.ffn_down.weight:qs"}> : () -> tensor<4096x14336xf8E4M3FNUZ>
%18467 = "torch_c.from_builtin_tensor"(%18466) : (tensor<4096x14336xf8E4M3FNUZ>) -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
%18468 = "util.global.load"() <{global = @__auto.output_norm.weight}> : () -> tensor<4096xbf16>
%18469 = "torch_c.from_builtin_tensor"(%18468) : (tensor<4096xbf16>) -> !torch.vtensor<[4096],bf16>
%18470 = "util.global.load"() <{global = @__auto.output.weight}> : () -> tensor<128256x4096xbf16>
%18471 = "torch_c.from_builtin_tensor"(%18470) : (tensor<128256x4096xbf16>) -> !torch.vtensor<[128256,4096],bf16>
%18472 = "torch.copy.to_vtensor"(%arg70) : (!torch.tensor<[?,2097152],f8E4M3FNUZ>) -> !torch.vtensor<[?,2097152],f8E4M3FNUZ>
%18473 = "torch.symbolic_int"() <{max_val = 131040 : i64, min_val = 64 : i64, symbol_name = "32*s1"}> : () -> !torch.int
%18474 = "torch.symbolic_int"() <{max_val = 4095 : i64, min_val = 2 : i64, symbol_name = "s1"}> : () -> !torch.int
%18475 = "torch.symbolic_int"() <{max_val = 9223372036854775807 : i64, min_val = 0 : i64, symbol_name = "s2"}> : () -> !torch.int
"torch.bind_symbolic_shape"(%arg67, %18474) <{shape_expressions = #map}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
"torch.bind_symbolic_shape"(%arg69, %18474) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
"torch.bind_symbolic_shape"(%18472, %18475) <{shape_expressions = #map2}> : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> ()
%18476 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%18477 = "torch.aten.size.int"(%arg69, %18476) : (!torch.vtensor<[4,?],si64>, !torch.int) -> !torch.int
%18478 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%18479 = "torch.aten.size.int"(%18472, %18478) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> !torch.int
%18480 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%18481 = "torch.aten.size.int"(%arg67, %18480) : (!torch.vtensor<[4,?],si64>, !torch.int) -> !torch.int
%18482 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%18483 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%18484 = "torch.constant.none"() : () -> !torch.none
%18485 = "torch.constant.none"() : () -> !torch.none
%18486 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
%18487 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
%18488 = "torch.aten.arange.start_step"(%18482, %18481, %18483, %18484, %18485, %18486, %18487) : (!torch.int, !torch.int, !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[?],si64>
"torch.bind_symbolic_shape"(%18488, %18474) <{shape_expressions = #map3}> : (!torch.vtensor<[?],si64>, !torch.int) -> ()
%18489 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
%18490 = "torch.aten.unsqueeze"(%arg68, %18489) : (!torch.vtensor<[4],si64>, !torch.int) -> !torch.vtensor<[4,1],si64>
%18491 = "torch.aten.ge.Tensor"(%18488, %18490) : (!torch.vtensor<[?],si64>, !torch.vtensor<[4,1],si64>) -> !torch.vtensor<[4,?],i1>
"torch.bind_symbolic_shape"(%18491, %18474) <{shape_expressions = #map}> : (!torch.vtensor<[4,?],i1>, !torch.int) -> ()
%18492 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%18493 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%18494 = "torch.prim.ListConstruct"(%18492, %18493) : (!torch.int, !torch.int) -> !torch.list<int>
%18495 = "torch.constant.int"() <{value = 11 : i64}> : () -> !torch.int
%18496 = "torch.constant.none"() : () -> !torch.none
%18497 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
%18498 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
%18499 = "torch.aten.ones"(%18494, %18495, %18496, %18497, %18498) : (!torch.list<int>, !torch.int, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[1,1],i1>
%18500 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
%18501 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
%18502 = "torch.prim.ListConstruct"(%18500, %18501) : (!torch.int, !torch.int) -> !torch.list<int>
%18503 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
%18504 = "torch.aten.expand"(%18499, %18502, %18503) : (!torch.vtensor<[1,1],i1>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[131072,131072],i1>
%18505 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%18506 = "torch.aten.triu"(%18504, %18505) : (!torch.vtensor<[131072,131072],i1>, !torch.int) -> !torch.vtensor<[131072,131072],i1>
%18507 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%18508 = "torch.aten.unsqueeze"(%18506, %18507) : (!torch.vtensor<[131072,131072],i1>, !torch.int) -> !torch.vtensor<[1,131072,131072],i1>
%18509 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%18510 = "torch.aten.unsqueeze"(%18508, %18509) : (!torch.vtensor<[1,131072,131072],i1>, !torch.int) -> !torch.vtensor<[1,1,131072,131072],i1>
%18511 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
%18512 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%18513 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
%18514 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%18515 = "torch.aten.slice.Tensor"(%18510, %18511, %18512, %18513, %18514) : (!torch.vtensor<[1,1,131072,131072],i1>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,1,131072,131072],i1>
%18516 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
%18517 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%18518 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
%18519 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%18520 = "torch.aten.slice.Tensor"(%18515, %18516, %18517, %18518, %18519) : (!torch.vtensor<[1,1,131072,131072],i1>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,1,131072,131072],i1>
%18521 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%18522 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%18523 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
%18524 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%18525 = "torch.aten.slice.Tensor"(%18520, %18521, %18522, %18523, %18524) : (!torch.vtensor<[1,1,131072,131072],i1>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,1,131072,131072],i1>
%18526 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%18527 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%18528 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
%18529 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%18530 = "torch.aten.slice.Tensor"(%18525, %18526, %18527, %18528, %18529) : (!torch.vtensor<[1,1,131072,131072],i1>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,1,131072,131072],i1>
%18531 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
%18532 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%18533 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%18534 = "torch.aten.slice.Tensor"(%18530, %18531, %18532, %18481, %18533) : (!torch.vtensor<[1,1,131072,131072],i1>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,1,?,131072],i1>
"torch.bind_symbolic_shape"(%18534, %18474) <{shape_expressions = #map4}> : (!torch.vtensor<[1,1,?,131072],i1>, !torch.int) -> ()
%18535 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
%18536 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%18537 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%18538 = "torch.aten.slice.Tensor"(%18534, %18535, %18536, %18481, %18537) : (!torch.vtensor<[1,1,?,131072],i1>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,1,?,?],i1>
"torch.bind_symbolic_shape"(%18538, %18474) <{shape_expressions = #map5}> : (!torch.vtensor<[1,1,?,?],i1>, !torch.int) -> ()
%18539 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%18540 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%18541 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
%18542 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%18543 = "torch.aten.slice.Tensor"(%18491, %18539, %18540, %18541, %18542) : (!torch.vtensor<[4,?],i1>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?],i1>
"torch.bind_symbolic_shape"(%18543, %18474) <{shape_expressions = #map}> : (!torch.vtensor<[4,?],i1>, !torch.int) -> ()
%18544 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%18545 = "torch.aten.unsqueeze"(%18543, %18544) : (!torch.vtensor<[4,?],i1>, !torch.int) -> !torch.vtensor<[4,1,?],i1>
"torch.bind_symbolic_shape"(%18545, %18474) <{shape_expressions = #map6}> : (!torch.vtensor<[4,1,?],i1>, !torch.int) -> ()
%18546 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
%18547 = "torch.aten.unsqueeze"(%18545, %18546) : (!torch.vtensor<[4,1,?],i1>, !torch.int) -> !torch.vtensor<[4,1,1,?],i1>
"torch.bind_symbolic_shape"(%18547, %18474) <{shape_expressions = #map7}> : (!torch.vtensor<[4,1,1,?],i1>, !torch.int) -> ()
%18548 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
%18549 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%18550 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
%18551 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%18552 = "torch.aten.slice.Tensor"(%18547, %18548, %18549, %18550, %18551) : (!torch.vtensor<[4,1,1,?],i1>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,1,1,?],i1>
"torch.bind_symbolic_shape"(%18552, %18474) <{shape_expressions = #map7}> : (!torch.vtensor<[4,1,1,?],i1>, !torch.int) -> ()
%18553 = "torch.aten.logical_or"(%18538, %18552) : (!torch.vtensor<[1,1,?,?],i1>, !torch.vtensor<[4,1,1,?],i1>) -> !torch.vtensor<[4,1,?,?],i1>
"torch.bind_symbolic_shape"(%18553, %18474) <{shape_expressions = #map8}> : (!torch.vtensor<[4,1,?,?],i1>, !torch.int) -> ()
%18554 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%18555 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
%18556 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%18557 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
%18558 = "torch.constant.none"() : () -> !torch.none
%18559 = "torch.aten.scalar_tensor"(%18554, %18555, %18556, %18557, %18558) : (!torch.int, !torch.int, !torch.int, !torch.Device, !torch.none) -> !torch.vtensor<[],f32>
%18560 = "torch.constant.float"() <{value = 0xFFF0000000000000 : f64}> : () -> !torch.float
%18561 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
%18562 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%18563 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
%18564 = "torch.constant.none"() : () -> !torch.none
%18565 = "torch.aten.scalar_tensor"(%18560, %18561, %18562, %18563, %18564) : (!torch.float, !torch.int, !torch.int, !torch.Device, !torch.none) -> !torch.vtensor<[],f32>
%18566 = "torch.aten.where.self"(%18553, %18565, %18559) : (!torch.vtensor<[4,1,?,?],i1>, !torch.vtensor<[],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,1,?,?],f32>
"torch.bind_symbolic_shape"(%18566, %18474) <{shape_expressions = #map8}> : (!torch.vtensor<[4,1,?,?],f32>, !torch.int) -> ()
%18567 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
%18568 = "torch.prims.convert_element_type"(%18566, %18567) : (!torch.vtensor<[4,1,?,?],f32>, !torch.int) -> !torch.vtensor<[4,1,?,?],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%18568, %18474) <{shape_expressions = #map8}> : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> ()
%18569 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
%18570 = "torch.prims.convert_element_type"(%18568, %18569) : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,1,?,?],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%18570, %18474) <{shape_expressions = #map8}> : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> ()
%18571 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
%18572 = "torch.prims.convert_element_type"(%17187, %18571) : (!torch.vtensor<[128256,4096],bf16>, !torch.int) -> !torch.vtensor<[128256,4096],bf16>
%18573 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
%18574 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
%18575 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
%18576 = "torch.aten.embedding"(%18572, %arg67, %18573, %18574, %18575) : (!torch.vtensor<[128256,4096],bf16>, !torch.vtensor<[4,?],si64>, !torch.int, !torch.bool, !torch.bool) -> !torch.vtensor<[4,?,4096],bf16>
"torch.bind_symbolic_shape"(%18576, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
%18577 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
%18578 = "torch.prims.convert_element_type"(%18576, %18577) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
"torch.bind_symbolic_shape"(%18578, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
%18579 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
%18580 = "torch.aten.pow.Tensor_Scalar"(%18578, %18579) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
"torch.bind_symbolic_shape"(%18580, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
%18581 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
%18582 = "torch.prim.ListConstruct"(%18581) : (!torch.int) -> !torch.list<int>
%18583 = "torch.constant.bool"() <{value = true}> : () -> !torch.bool
%18584 = "torch.constant.none"() : () -> !torch.none
%18585 = "torch.aten.mean.dim"(%18580, %18582, %18583, %18584) : (!torch.vtensor<[4,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none) -> !torch.vtensor<[4,?,1],f32>
"torch.bind_symbolic_shape"(%18585, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
%18586 = "torch.constant.float"() <{value = 1.000000e-05 : f64}> : () -> !torch.float
%18587 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%18588 = "torch.aten.add.Scalar"(%18585, %18586, %18587) : (!torch.vtensor<[4,?,1],f32>, !torch.float, !torch.int) -> !torch.vtensor<[4,?,1],f32>
"torch.bind_symbolic_shape"(%18588, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
%18589 = "torch.aten.rsqrt"(%18588) : (!torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,1],f32>
"torch.bind_symbolic_shape"(%18589, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
%18590 = "torch.aten.mul.Tensor"(%18578, %18589) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,4096],f32>
"torch.bind_symbolic_shape"(%18590, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
%18591 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
%18592 = "torch.prims.convert_element_type"(%18590, %18591) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
"torch.bind_symbolic_shape"(%18592, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
%18593 = "torch.aten.mul.Tensor"(%17189, %18592) : (!torch.vtensor<[4096],bf16>, !torch.vtensor<[4,?,4096],bf16>) -> !torch.vtensor<[4,?,4096],bf16>
"torch.bind_symbolic_shape"(%18593, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
%18594 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
%18595 = "torch.prims.convert_element_type"(%18593, %18594) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
"torch.bind_symbolic_shape"(%18595, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
%18596 = "torch.aten.div.Tensor"(%18595, %17191) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
"torch.bind_symbolic_shape"(%18596, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
%18597 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
%18598 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
%18599 = "torch.aten.clamp"(%18596, %18597, %18598) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
"torch.bind_symbolic_shape"(%18599, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
%18600 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
%18601 = "torch.prims.convert_element_type"(%18599, %18600) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%18601, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
%18602 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%18603 = "torch.aten.unsqueeze"(%17193, %18602) : (!torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>
%18604 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%18605 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
%18606 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
%18607 = "torch.prim.ListConstruct"(%18604, %18605, %18606) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%18608 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
%18609 = "torch.aten.expand"(%18603, %18607, %18608) : (!torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ>
%18610 = "torch_c.to_builtin_tensor"(%18601) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
%18611 = "torch_c.to_builtin_tensor"(%18609) : (!torch.vtensor<[4,4096,4096],f8E4M3FNUZ>) -> tensor<4x4096x4096xf8E4M3FNUZ>
%18612 = "util.call"(%18610, %18611) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x?x4096xf32>
%18613 = "torch_c.from_builtin_tensor"(%18612) : (tensor<4x?x4096xf32>) -> !torch.vtensor<[4,?,4096],f32>
"torch.bind_symbolic_shape"(%18613, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
%18614 = "torch.aten.div.Tensor"(%18613, %17195) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],f32>
"torch.bind_symbolic_shape"(%18614, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
%18615 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
%18616 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
%18617 = "torch.aten.clamp"(%18614, %18615, %18616) : (!torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],f32>
"torch.bind_symbolic_shape"(%18617, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
%18618 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
%18619 = "torch.prims.convert_element_type"(%18617, %18618) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%18619, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
%18620 = "torch.aten.div.Tensor"(%18595, %17197) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
"torch.bind_symbolic_shape"(%18620, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
%18621 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
%18622 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
%18623 = "torch.aten.clamp"(%18620, %18621, %18622) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
"torch.bind_symbolic_shape"(%18623, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
%18624 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
%18625 = "torch.prims.convert_element_type"(%18623, %18624) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%18625, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
%18626 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%18627 = "torch.aten.unsqueeze"(%17199, %18626) : (!torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>
%18628 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%18629 = "torch.constant.int"() <{value = 1024 : i64}> : () -> !torch.int
%18630 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
%18631 = "torch.prim.ListConstruct"(%18628, %18629, %18630) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%18632 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
%18633 = "torch.aten.expand"(%18627, %18631, %18632) : (!torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ>
%18634 = "torch_c.to_builtin_tensor"(%18625) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
%18635 = "torch_c.to_builtin_tensor"(%18633) : (!torch.vtensor<[4,1024,4096],f8E4M3FNUZ>) -> tensor<4x1024x4096xf8E4M3FNUZ>
%18636 = "util.call"(%18634, %18635) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x?x1024xf32>
%18637 = "torch_c.from_builtin_tensor"(%18636) : (tensor<4x?x1024xf32>) -> !torch.vtensor<[4,?,1024],f32>
"torch.bind_symbolic_shape"(%18637, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
%18638 = "torch.aten.div.Tensor"(%18637, %17201) : (!torch.vtensor<[4,?,1024],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,1024],f32>
"torch.bind_symbolic_shape"(%18638, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
%18639 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
%18640 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
%18641 = "torch.aten.clamp"(%18638, %18639, %18640) : (!torch.vtensor<[4,?,1024],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,1024],f32>
"torch.bind_symbolic_shape"(%18641, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
%18642 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
%18643 = "torch.prims.convert_element_type"(%18641, %18642) : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> !torch.vtensor<[4,?,1024],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%18643, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.int) -> ()
%18644 = "torch.aten.div.Tensor"(%18595, %17203) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
"torch.bind_symbolic_shape"(%18644, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
%18645 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
%18646 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
%18647 = "torch.aten.clamp"(%18644, %18645, %18646) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
"torch.bind_symbolic_shape"(%18647, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
%18648 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
%18649 = "torch.prims.convert_element_type"(%18647, %18648) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%18649, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
%18650 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%18651 = "torch.aten.unsqueeze"(%17205, %18650) : (!torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>
%18652 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%18653 = "torch.constant.int"() <{value = 1024 : i64}> : () -> !torch.int
%18654 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
%18655 = "torch.prim.ListConstruct"(%18652, %18653, %18654) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%18656 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
%18657 = "torch.aten.expand"(%18651, %18655, %18656) : (!torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ>
%18658 = "torch_c.to_builtin_tensor"(%18649) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
%18659 = "torch_c.to_builtin_tensor"(%18657) : (!torch.vtensor<[4,1024,4096],f8E4M3FNUZ>) -> tensor<4x1024x4096xf8E4M3FNUZ>
%18660 = "util.call"(%18658, %18659) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x?x1024xf32>
%18661 = "torch_c.from_builtin_tensor"(%18660) : (tensor<4x?x1024xf32>) -> !torch.vtensor<[4,?,1024],f32>
"torch.bind_symbolic_shape"(%18661, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
%18662 = "torch.aten.div.Tensor"(%18661, %17207) : (!torch.vtensor<[4,?,1024],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,1024],f32>
"torch.bind_symbolic_shape"(%18662, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
%18663 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
%18664 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
%18665 = "torch.aten.clamp"(%18662, %18663, %18664) : (!torch.vtensor<[4,?,1024],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,1024],f32>
"torch.bind_symbolic_shape"(%18665, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
%18666 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
%18667 = "torch.prims.convert_element_type"(%18665, %18666) : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> !torch.vtensor<[4,?,1024],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%18667, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.int) -> ()
%18668 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%18669 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
%18670 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
%18671 = "torch.prim.ListConstruct"(%18668, %18481, %18669, %18670) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%18672 = "torch.aten.view"(%18619, %18671) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%18672, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
%18673 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%18674 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
%18675 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
%18676 = "torch.prim.ListConstruct"(%18673, %18481, %18674, %18675) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%18677 = "torch.aten.view"(%18643, %18676) : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%18677, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
%18678 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%18679 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
%18680 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
%18681 = "torch.prim.ListConstruct"(%18678, %18481, %18679, %18680) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%18682 = "torch.aten.view"(%18667, %18681) : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%18682, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
%18683 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
%18684 = "torch.constant.none"() : () -> !torch.none
%18685 = "torch.constant.none"() : () -> !torch.none
%18686 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
%18687 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
%18688 = "torch.aten.arange"(%18683, %18684, %18685, %18686, %18687) : (!torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[131072],si64>
%18689 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%18690 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
%18691 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
%18692 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%18693 = "torch.constant.none"() : () -> !torch.none
%18694 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
%18695 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
%18696 = "torch.aten.arange.start_step"(%18689, %18690, %18691, %18692, %18693, %18694, %18695) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[64],si64>
%18697 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
%18698 = "torch.prims.convert_element_type"(%18696, %18697) : (!torch.vtensor<[64],si64>, !torch.int) -> !torch.vtensor<[64],f32>
%18699 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
%18700 = "torch.aten.div.Scalar"(%18698, %18699) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
%18701 = "torch.constant.float"() <{value = 5.000000e+05 : f64}> : () -> !torch.float
%18702 = "torch.aten.pow.Scalar"(%18701, %18700) : (!torch.float, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
%18703 = "torch.aten.reciprocal"(%18702) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
%18704 = "torch.constant.float"() <{value = 1.000000e+00 : f64}> : () -> !torch.float
%18705 = "torch.aten.mul.Scalar"(%18703, %18704) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
%18706 = "torch.aten.reciprocal"(%18705) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
%18707 = "torch.constant.float"() <{value = 6.2831853071795862 : f64}> : () -> !torch.float
%18708 = "torch.aten.mul.Scalar"(%18706, %18707) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
%18709 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
%18710 = "torch.aten.gt.Scalar"(%18708, %18709) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
%18711 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
%18712 = "torch.aten.div.Scalar"(%18705, %18711) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
%18713 = "torch.aten.where.self"(%18710, %18712, %18705) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
%18714 = "torch.aten.reciprocal"(%18708) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
%18715 = "torch.constant.int"() <{value = 8192 : i64}> : () -> !torch.int
%18716 = "torch.aten.mul.Scalar"(%18714, %18715) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
%18717 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%18718 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%18719 = "torch.aten.sub.Scalar"(%18716, %18717, %18718) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
%18720 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
%18721 = "torch.aten.div.Scalar"(%18719, %18720) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
%18722 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%18723 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%18724 = "torch.aten.rsub.Scalar"(%18721, %18722, %18723) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
%18725 = "torch.aten.mul.Tensor"(%18724, %18713) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
%18726 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
%18727 = "torch.aten.div.Scalar"(%18725, %18726) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
%18728 = "torch.aten.mul.Tensor"(%18721, %18713) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
%18729 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%18730 = "torch.aten.add.Tensor"(%18727, %18728, %18729) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
%18731 = "torch.constant.float"() <{value = 2.048000e+03 : f64}> : () -> !torch.float
%18732 = "torch.aten.lt.Scalar"(%18708, %18731) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
%18733 = "torch.aten.bitwise_not"(%18732) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
%18734 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
%18735 = "torch.aten.gt.Scalar"(%18708, %18734) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
%18736 = "torch.aten.bitwise_not"(%18735) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
%18737 = "torch.aten.mul.Tensor"(%18733, %18736) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
%18738 = "torch.aten.where.self"(%18737, %18730, %18713) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
%18739 = "torch.prim.ListConstruct"(%18738, %18738) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.list<vtensor>
%18740 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
%18741 = "torch.aten.cat"(%18739, %18740) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[128],f32>
%18742 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
%18743 = "torch.prims.convert_element_type"(%18688, %18742) : (!torch.vtensor<[131072],si64>, !torch.int) -> !torch.vtensor<[131072],f32>
%18744 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
%18745 = "torch.prims.convert_element_type"(%18741, %18744) : (!torch.vtensor<[128],f32>, !torch.int) -> !torch.vtensor<[128],f32>
%18746 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
%18747 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%18748 = "torch.prim.ListConstruct"(%18746, %18747) : (!torch.int, !torch.int) -> !torch.list<int>
%18749 = "torch.aten.view"(%18743, %18748) : (!torch.vtensor<[131072],f32>, !torch.list<int>) -> !torch.vtensor<[131072,1],f32>
%18750 = "torch.aten.mul.Tensor"(%18749, %18745) : (!torch.vtensor<[131072,1],f32>, !torch.vtensor<[128],f32>) -> !torch.vtensor<[131072,128],f32>
%18751 = "torch.aten.cos"(%18750) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
%18752 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
%18753 = "torch.prims.convert_element_type"(%18751, %18752) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
%18754 = "torch.aten.sin"(%18750) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
%18755 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
%18756 = "torch.prims.convert_element_type"(%18754, %18755) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
%18757 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%18758 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%18759 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%18760 = "torch.aten.slice.Tensor"(%18753, %18757, %18758, %18481, %18759) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
"torch.bind_symbolic_shape"(%18760, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
%18761 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%18762 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%18763 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
%18764 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%18765 = "torch.aten.slice.Tensor"(%18760, %18761, %18762, %18763, %18764) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
"torch.bind_symbolic_shape"(%18765, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
%18766 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%18767 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%18768 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%18769 = "torch.aten.slice.Tensor"(%18756, %18766, %18767, %18481, %18768) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
"torch.bind_symbolic_shape"(%18769, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
%18770 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%18771 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%18772 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
%18773 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%18774 = "torch.aten.slice.Tensor"(%18769, %18770, %18771, %18772, %18773) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
"torch.bind_symbolic_shape"(%18774, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
%18775 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%18776 = "torch.aten.unsqueeze"(%18765, %18775) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
"torch.bind_symbolic_shape"(%18776, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
%18777 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%18778 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%18779 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
%18780 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%18781 = "torch.aten.slice.Tensor"(%18776, %18777, %18778, %18779, %18780) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
"torch.bind_symbolic_shape"(%18781, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
%18782 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
%18783 = "torch.aten.unsqueeze"(%18781, %18782) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
"torch.bind_symbolic_shape"(%18783, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
%18784 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
%18785 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%18786 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
%18787 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%18788 = "torch.aten.slice.Tensor"(%18783, %18784, %18785, %18786, %18787) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
"torch.bind_symbolic_shape"(%18788, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
%18789 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%18790 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%18791 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%18792 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%18793 = "torch.prim.ListConstruct"(%18789, %18790, %18791, %18792) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%18794 = "torch.aten.repeat"(%18788, %18793) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
"torch.bind_symbolic_shape"(%18794, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
%18795 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%18796 = "torch.aten.unsqueeze"(%18774, %18795) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
"torch.bind_symbolic_shape"(%18796, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
%18797 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%18798 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%18799 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
%18800 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%18801 = "torch.aten.slice.Tensor"(%18796, %18797, %18798, %18799, %18800) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
"torch.bind_symbolic_shape"(%18801, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
%18802 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
%18803 = "torch.aten.unsqueeze"(%18801, %18802) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
"torch.bind_symbolic_shape"(%18803, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
%18804 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
%18805 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%18806 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
%18807 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%18808 = "torch.aten.slice.Tensor"(%18803, %18804, %18805, %18806, %18807) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
"torch.bind_symbolic_shape"(%18808, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
%18809 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%18810 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%18811 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%18812 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%18813 = "torch.prim.ListConstruct"(%18809, %18810, %18811, %18812) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%18814 = "torch.aten.repeat"(%18808, %18813) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
"torch.bind_symbolic_shape"(%18814, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
%18815 = "torch.aten.mul.Tensor"(%18672, %18794) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%18815, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
%18816 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
%18817 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%18818 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
%18819 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%18820 = "torch.aten.slice.Tensor"(%18672, %18816, %18817, %18818, %18819) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%18820, %18474) <{shape_expressions = #map18}> : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.int) -> ()
%18821 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
%18822 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
%18823 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
%18824 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%18825 = "torch.aten.slice.Tensor"(%18672, %18821, %18822, %18823, %18824) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%18825, %18474) <{shape_expressions = #map18}> : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.int) -> ()
%18826 = "torch.aten.neg"(%18825) : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>) -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%18826, %18474) <{shape_expressions = #map18}> : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.int) -> ()
%18827 = "torch.prim.ListConstruct"(%18826, %18820) : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>) -> !torch.list<vtensor>
%18828 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
%18829 = "torch.aten.cat"(%18827, %18828) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%18829, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
%18830 = "torch.aten.mul.Tensor"(%18829, %18814) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%18830, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
%18831 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%18832 = "torch.aten.add.Tensor"(%18815, %18830, %18831) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%18832, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
%18833 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
%18834 = "torch.constant.none"() : () -> !torch.none
%18835 = "torch.constant.none"() : () -> !torch.none
%18836 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
%18837 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
%18838 = "torch.aten.arange"(%18833, %18834, %18835, %18836, %18837) : (!torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[131072],si64>
%18839 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%18840 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
%18841 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
%18842 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%18843 = "torch.constant.none"() : () -> !torch.none
%18844 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
%18845 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
%18846 = "torch.aten.arange.start_step"(%18839, %18840, %18841, %18842, %18843, %18844, %18845) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[64],si64>
%18847 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
%18848 = "torch.prims.convert_element_type"(%18846, %18847) : (!torch.vtensor<[64],si64>, !torch.int) -> !torch.vtensor<[64],f32>
%18849 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
%18850 = "torch.aten.div.Scalar"(%18848, %18849) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
%18851 = "torch.constant.float"() <{value = 5.000000e+05 : f64}> : () -> !torch.float
%18852 = "torch.aten.pow.Scalar"(%18851, %18850) : (!torch.float, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
%18853 = "torch.aten.reciprocal"(%18852) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
%18854 = "torch.constant.float"() <{value = 1.000000e+00 : f64}> : () -> !torch.float
%18855 = "torch.aten.mul.Scalar"(%18853, %18854) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
%18856 = "torch.aten.reciprocal"(%18855) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
%18857 = "torch.constant.float"() <{value = 6.2831853071795862 : f64}> : () -> !torch.float
%18858 = "torch.aten.mul.Scalar"(%18856, %18857) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
%18859 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
%18860 = "torch.aten.gt.Scalar"(%18858, %18859) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
%18861 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
%18862 = "torch.aten.div.Scalar"(%18855, %18861) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
%18863 = "torch.aten.where.self"(%18860, %18862, %18855) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
%18864 = "torch.aten.reciprocal"(%18858) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
%18865 = "torch.constant.int"() <{value = 8192 : i64}> : () -> !torch.int
%18866 = "torch.aten.mul.Scalar"(%18864, %18865) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
%18867 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%18868 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%18869 = "torch.aten.sub.Scalar"(%18866, %18867, %18868) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
%18870 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
%18871 = "torch.aten.div.Scalar"(%18869, %18870) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
%18872 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%18873 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%18874 = "torch.aten.rsub.Scalar"(%18871, %18872, %18873) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
%18875 = "torch.aten.mul.Tensor"(%18874, %18863) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
%18876 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
%18877 = "torch.aten.div.Scalar"(%18875, %18876) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
%18878 = "torch.aten.mul.Tensor"(%18871, %18863) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
%18879 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%18880 = "torch.aten.add.Tensor"(%18877, %18878, %18879) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
%18881 = "torch.constant.float"() <{value = 2.048000e+03 : f64}> : () -> !torch.float
%18882 = "torch.aten.lt.Scalar"(%18858, %18881) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
%18883 = "torch.aten.bitwise_not"(%18882) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
%18884 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
%18885 = "torch.aten.gt.Scalar"(%18858, %18884) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
%18886 = "torch.aten.bitwise_not"(%18885) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
%18887 = "torch.aten.mul.Tensor"(%18883, %18886) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
%18888 = "torch.aten.where.self"(%18887, %18880, %18863) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
%18889 = "torch.prim.ListConstruct"(%18888, %18888) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.list<vtensor>
%18890 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
%18891 = "torch.aten.cat"(%18889, %18890) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[128],f32>
%18892 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
%18893 = "torch.prims.convert_element_type"(%18838, %18892) : (!torch.vtensor<[131072],si64>, !torch.int) -> !torch.vtensor<[131072],f32>
%18894 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
%18895 = "torch.prims.convert_element_type"(%18891, %18894) : (!torch.vtensor<[128],f32>, !torch.int) -> !torch.vtensor<[128],f32>
%18896 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
%18897 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%18898 = "torch.prim.ListConstruct"(%18896, %18897) : (!torch.int, !torch.int) -> !torch.list<int>
%18899 = "torch.aten.view"(%18893, %18898) : (!torch.vtensor<[131072],f32>, !torch.list<int>) -> !torch.vtensor<[131072,1],f32>
%18900 = "torch.aten.mul.Tensor"(%18899, %18895) : (!torch.vtensor<[131072,1],f32>, !torch.vtensor<[128],f32>) -> !torch.vtensor<[131072,128],f32>
%18901 = "torch.aten.cos"(%18900) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
%18902 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
%18903 = "torch.prims.convert_element_type"(%18901, %18902) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
%18904 = "torch.aten.sin"(%18900) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
%18905 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
%18906 = "torch.prims.convert_element_type"(%18904, %18905) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
%18907 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%18908 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%18909 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%18910 = "torch.aten.slice.Tensor"(%18903, %18907, %18908, %18481, %18909) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
"torch.bind_symbolic_shape"(%18910, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
%18911 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%18912 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%18913 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
%18914 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%18915 = "torch.aten.slice.Tensor"(%18910, %18911, %18912, %18913, %18914) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
"torch.bind_symbolic_shape"(%18915, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
%18916 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%18917 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%18918 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%18919 = "torch.aten.slice.Tensor"(%18906, %18916, %18917, %18481, %18918) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
"torch.bind_symbolic_shape"(%18919, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
%18920 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%18921 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%18922 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
%18923 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%18924 = "torch.aten.slice.Tensor"(%18919, %18920, %18921, %18922, %18923) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
"torch.bind_symbolic_shape"(%18924, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
%18925 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%18926 = "torch.aten.unsqueeze"(%18915, %18925) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
"torch.bind_symbolic_shape"(%18926, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
%18927 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%18928 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%18929 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
%18930 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%18931 = "torch.aten.slice.Tensor"(%18926, %18927, %18928, %18929, %18930) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
"torch.bind_symbolic_shape"(%18931, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
%18932 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
%18933 = "torch.aten.unsqueeze"(%18931, %18932) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
"torch.bind_symbolic_shape"(%18933, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
%18934 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
%18935 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%18936 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
%18937 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%18938 = "torch.aten.slice.Tensor"(%18933, %18934, %18935, %18936, %18937) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
"torch.bind_symbolic_shape"(%18938, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
%18939 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%18940 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%18941 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%18942 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%18943 = "torch.prim.ListConstruct"(%18939, %18940, %18941, %18942) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%18944 = "torch.aten.repeat"(%18938, %18943) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
"torch.bind_symbolic_shape"(%18944, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
%18945 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%18946 = "torch.aten.unsqueeze"(%18924, %18945) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
"torch.bind_symbolic_shape"(%18946, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
%18947 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%18948 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%18949 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
%18950 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%18951 = "torch.aten.slice.Tensor"(%18946, %18947, %18948, %18949, %18950) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
"torch.bind_symbolic_shape"(%18951, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
%18952 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
%18953 = "torch.aten.unsqueeze"(%18951, %18952) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
"torch.bind_symbolic_shape"(%18953, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
%18954 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
%18955 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%18956 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
%18957 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%18958 = "torch.aten.slice.Tensor"(%18953, %18954, %18955, %18956, %18957) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
"torch.bind_symbolic_shape"(%18958, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
%18959 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%18960 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%18961 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%18962 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%18963 = "torch.prim.ListConstruct"(%18959, %18960, %18961, %18962) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%18964 = "torch.aten.repeat"(%18958, %18963) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
"torch.bind_symbolic_shape"(%18964, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
%18965 = "torch.aten.mul.Tensor"(%18677, %18944) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%18965, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
%18966 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
%18967 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%18968 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
%18969 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%18970 = "torch.aten.slice.Tensor"(%18677, %18966, %18967, %18968, %18969) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%18970, %18474) <{shape_expressions = #map19}> : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.int) -> ()
%18971 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
%18972 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
%18973 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
%18974 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%18975 = "torch.aten.slice.Tensor"(%18677, %18971, %18972, %18973, %18974) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%18975, %18474) <{shape_expressions = #map19}> : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.int) -> ()
%18976 = "torch.aten.neg"(%18975) : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>) -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%18976, %18474) <{shape_expressions = #map19}> : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.int) -> ()
%18977 = "torch.prim.ListConstruct"(%18976, %18970) : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>) -> !torch.list<vtensor>
%18978 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
%18979 = "torch.aten.cat"(%18977, %18978) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%18979, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
%18980 = "torch.aten.mul.Tensor"(%18979, %18964) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%18980, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
%18981 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%18982 = "torch.aten.add.Tensor"(%18965, %18980, %18981) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%18982, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
%18983 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
%18984 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
%18985 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
%18986 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
%18987 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
%18988 = "torch.prim.ListConstruct"(%18479, %18983, %18984, %18985, %18986, %18987) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%18989 = "torch.aten.view"(%18472, %18988) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%18989, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
%18990 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
%18991 = "torch.aten.mul.int"(%18479, %18990) : (!torch.int, !torch.int) -> !torch.int
%18992 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
%18993 = "torch.aten.mul.int"(%18991, %18992) : (!torch.int, !torch.int) -> !torch.int
%18994 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
%18995 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
%18996 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
%18997 = "torch.prim.ListConstruct"(%18993, %18994, %18995, %18996) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%18998 = "torch.aten.view"(%18989, %18997) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%18998, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
%18999 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
%19000 = "torch.aten.mul.Scalar"(%arg69, %18999) : (!torch.vtensor<[4,?],si64>, !torch.int) -> !torch.vtensor<[4,?],si64>
"torch.bind_symbolic_shape"(%19000, %18474) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
%19001 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%19002 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%19003 = "torch.aten.add.Scalar"(%19000, %19001, %19002) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
"torch.bind_symbolic_shape"(%19003, %18474) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
%19004 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%19005 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
%19006 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
%19007 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
%19008 = "torch.prim.ListConstruct"(%19004, %18477, %19005, %19006, %19007) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%19009 = "torch.aten.view"(%18982, %19008) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%19009, %18474) <{shape_expressions = #map22}> : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
%19010 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%19011 = "torch.aten.mul.int"(%19010, %18477) : (!torch.int, !torch.int) -> !torch.int
%19012 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
%19013 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
%19014 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
%19015 = "torch.prim.ListConstruct"(%19011, %19012, %19013, %19014) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%19016 = "torch.aten.view"(%19009, %19015) : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%19016, %18474) <{shape_expressions = #map23}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
%19017 = "torch.prim.ListConstruct"(%19011) : (!torch.int) -> !torch.list<int>
%19018 = "torch.aten.view"(%19003, %19017) : (!torch.vtensor<[4,?],si64>, !torch.list<int>) -> !torch.vtensor<[?],si64>
"torch.bind_symbolic_shape"(%19018, %18474) <{shape_expressions = #map24}> : (!torch.vtensor<[?],si64>, !torch.int) -> ()
%19019 = "torch.prim.ListConstruct"(%19018) : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>>
%19020 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
%19021 = "torch.aten.index_put"(%18998, %19019, %19016, %19020) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.bool) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%19021, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
%19022 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
%19023 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
%19024 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
%19025 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
%19026 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
%19027 = "torch.prim.ListConstruct"(%18479, %19022, %19023, %19024, %19025, %19026) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%19028 = "torch.aten.view"(%19021, %19027) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%19028, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
%19029 = "torch.constant.int"() <{value = 2097152 : i64}> : () -> !torch.int
%19030 = "torch.prim.ListConstruct"(%18479, %19029) : (!torch.int, !torch.int) -> !torch.list<int>
%19031 = "torch.aten.view"(%19028, %19030) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,2097152],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%19031, %18475) <{shape_expressions = #map2}> : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> ()
%19032 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
%19033 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
%19034 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
%19035 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
%19036 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
%19037 = "torch.prim.ListConstruct"(%18479, %19032, %19033, %19034, %19035, %19036) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%19038 = "torch.aten.view"(%19031, %19037) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%19038, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
%19039 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
%19040 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
%19041 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
%19042 = "torch.prim.ListConstruct"(%18993, %19039, %19040, %19041) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%19043 = "torch.aten.view"(%19038, %19042) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%19043, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
%19044 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%19045 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
%19046 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
%19047 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
%19048 = "torch.prim.ListConstruct"(%19044, %18477, %19045, %19046, %19047) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%19049 = "torch.aten.view"(%18682, %19048) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%19049, %18474) <{shape_expressions = #map22}> : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
%19050 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
%19051 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
%19052 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
%19053 = "torch.prim.ListConstruct"(%19011, %19050, %19051, %19052) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%19054 = "torch.aten.view"(%19049, %19053) : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%19054, %18474) <{shape_expressions = #map23}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
%19055 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%19056 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%19057 = "torch.aten.add.Scalar"(%19003, %19055, %19056) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
"torch.bind_symbolic_shape"(%19057, %18474) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
%19058 = "torch.prim.ListConstruct"(%19011) : (!torch.int) -> !torch.list<int>
%19059 = "torch.aten.view"(%19057, %19058) : (!torch.vtensor<[4,?],si64>, !torch.list<int>) -> !torch.vtensor<[?],si64>
"torch.bind_symbolic_shape"(%19059, %18474) <{shape_expressions = #map24}> : (!torch.vtensor<[?],si64>, !torch.int) -> ()
%19060 = "torch.prim.ListConstruct"(%19059) : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>>
%19061 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
%19062 = "torch.aten.index_put"(%19043, %19060, %19054, %19061) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.bool) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%19062, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
%19063 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
%19064 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
%19065 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
%19066 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
%19067 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
%19068 = "torch.prim.ListConstruct"(%18479, %19063, %19064, %19065, %19066, %19067) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%19069 = "torch.aten.view"(%19062, %19068) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%19069, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
%19070 = "torch.constant.int"() <{value = 2097152 : i64}> : () -> !torch.int
%19071 = "torch.prim.ListConstruct"(%18479, %19070) : (!torch.int, !torch.int) -> !torch.list<int>
%19072 = "torch.aten.view"(%19069, %19071) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,2097152],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%19072, %18475) <{shape_expressions = #map2}> : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> ()
%19073 = "torch.constant.int"() <{value = -2 : i64}> : () -> !torch.int
%19074 = "torch.aten.unsqueeze"(%18982, %19073) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%19074, %18474) <{shape_expressions = #map25}> : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.int) -> ()
%19075 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%19076 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
%19077 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%19078 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
%19079 = "torch.prim.ListConstruct"(%19075, %18481, %19076, %19077, %19078) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%19080 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
%19081 = "torch.aten.expand"(%19074, %19079, %19080) : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%19081, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
%19082 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%19083 = "torch.aten.clone"(%19081, %19082) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%19083, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
%19084 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%19085 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
%19086 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
%19087 = "torch.prim.ListConstruct"(%19084, %18481, %19085, %19086) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%19088 = "torch.aten._unsafe_view"(%19083, %19087) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%19088, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
%19089 = "torch.constant.int"() <{value = -2 : i64}> : () -> !torch.int
%19090 = "torch.aten.unsqueeze"(%18682, %19089) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%19090, %18474) <{shape_expressions = #map25}> : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.int) -> ()
%19091 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%19092 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
%19093 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%19094 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
%19095 = "torch.prim.ListConstruct"(%19091, %18481, %19092, %19093, %19094) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%19096 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
%19097 = "torch.aten.expand"(%19090, %19095, %19096) : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%19097, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
%19098 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%19099 = "torch.aten.clone"(%19097, %19098) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%19099, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
%19100 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%19101 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
%19102 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
%19103 = "torch.prim.ListConstruct"(%19100, %18481, %19101, %19102) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%19104 = "torch.aten._unsafe_view"(%19099, %19103) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%19104, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
%19105 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%19106 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
%19107 = "torch.aten.transpose.int"(%18832, %19105, %19106) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%19107, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
%19108 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%19109 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
%19110 = "torch.aten.transpose.int"(%19088, %19108, %19109) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%19110, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
%19111 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%19112 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
%19113 = "torch.aten.transpose.int"(%19104, %19111, %19112) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%19113, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
%19114 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%19115 = "torch.aten.squeeze.dim"(%18570, %19114) : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,1,?,?],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%19115, %18474) <{shape_expressions = #map8}> : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> ()
%19116 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%19117 = "torch.aten.squeeze.dim"(%19115, %19116) : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,1,?,?],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%19117, %18474) <{shape_expressions = #map8}> : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> ()
%19118 = "torch_c.to_builtin_tensor"(%19107) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
%19119 = "torch_c.to_builtin_tensor"(%19110) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
%19120 = "torch_c.to_builtin_tensor"(%19113) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
%19121 = "torch_c.to_builtin_tensor"(%19117) : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>) -> tensor<4x1x?x?xf8E4M3FNUZ>
%19122 = "tensor.cast"(%19121) : (tensor<4x1x?x?xf8E4M3FNUZ>) -> tensor<?x?x?x?xf8E4M3FNUZ>
%19123 = "torch_c.to_builtin_tensor"(%17209) : (!torch.vtensor<[],f32>) -> tensor<f32>
%19124 = "util.call"(%19118, %19119, %19120, %19123, %19122) <{callee = @sharktank_masked_flash_attention_4_32_128_128_f8E4M3FNUZ_f32_f32}> : (tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<f32>, tensor<?x?x?x?xf8E4M3FNUZ>) -> tensor<4x32x?x128xf32>
%19125 = "torch_c.from_builtin_tensor"(%19124) : (tensor<4x32x?x128xf32>) -> !torch.vtensor<[4,32,?,128],f32>
"torch.bind_symbolic_shape"(%19125, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f32>, !torch.int) -> ()
%19126 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%19127 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
%19128 = "torch.aten.transpose.int"(%19125, %19126, %19127) : (!torch.vtensor<[4,32,?,128],f32>, !torch.int, !torch.int) -> !torch.vtensor<[4,?,32,128],f32>
"torch.bind_symbolic_shape"(%19128, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f32>, !torch.int) -> ()
%19129 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%19130 = "torch.aten.clone"(%19128, %19129) : (!torch.vtensor<[4,?,32,128],f32>, !torch.int) -> !torch.vtensor<[4,?,32,128],f32>
"torch.bind_symbolic_shape"(%19130, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f32>, !torch.int) -> ()
%19131 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%19132 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
%19133 = "torch.prim.ListConstruct"(%19131, %18481, %19132) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%19134 = "torch.aten._unsafe_view"(%19130, %19133) : (!torch.vtensor<[4,?,32,128],f32>, !torch.list<int>) -> !torch.vtensor<[4,?,4096],f32>
"torch.bind_symbolic_shape"(%19134, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
%19135 = "torch.aten.div.Tensor"(%19134, %17211) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],f32>
"torch.bind_symbolic_shape"(%19135, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
%19136 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
%19137 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
%19138 = "torch.aten.clamp"(%19135, %19136, %19137) : (!torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],f32>
"torch.bind_symbolic_shape"(%19138, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
%19139 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
%19140 = "torch.prims.convert_element_type"(%19138, %19139) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%19140, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
%19141 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%19142 = "torch.aten.unsqueeze"(%17213, %19141) : (!torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>
%19143 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%19144 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
%19145 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
%19146 = "torch.prim.ListConstruct"(%19143, %19144, %19145) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%19147 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
%19148 = "torch.aten.expand"(%19142, %19146, %19147) : (!torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ>
%19149 = "torch_c.to_builtin_tensor"(%19140) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
%19150 = "torch_c.to_builtin_tensor"(%19148) : (!torch.vtensor<[4,4096,4096],f8E4M3FNUZ>) -> tensor<4x4096x4096xf8E4M3FNUZ>
%19151 = "util.call"(%19149, %19150) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x?x4096xf32>
%19152 = "torch_c.from_builtin_tensor"(%19151) : (tensor<4x?x4096xf32>) -> !torch.vtensor<[4,?,4096],f32>
"torch.bind_symbolic_shape"(%19152, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
%19153 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
%19154 = "torch.prims.convert_element_type"(%19152, %19153) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
"torch.bind_symbolic_shape"(%19154, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
%19155 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%19156 = "torch.aten.add.Tensor"(%18576, %19154, %19155) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
"torch.bind_symbolic_shape"(%19156, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
%19157 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
%19158 = "torch.prims.convert_element_type"(%19156, %19157) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
"torch.bind_symbolic_shape"(%19158, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
%19159 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
%19160 = "torch.aten.pow.Tensor_Scalar"(%19158, %19159) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
"torch.bind_symbolic_shape"(%19160, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
%19161 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
%19162 = "torch.prim.ListConstruct"(%19161) : (!torch.int) -> !torch.list<int>
%19163 = "torch.constant.bool"() <{value = true}> : () -> !torch.bool
%19164 = "torch.constant.none"() : () -> !torch.none
%19165 = "torch.aten.mean.dim"(%19160, %19162, %19163, %19164) : (!torch.vtensor<[4,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none) -> !torch.vtensor<[4,?,1],f32>
"torch.bind_symbolic_shape"(%19165, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
%19166 = "torch.constant.float"() <{value = 1.000000e-05 : f64}> : () -> !torch.float
%19167 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%19168 = "torch.aten.add.Scalar"(%19165, %19166, %19167) : (!torch.vtensor<[4,?,1],f32>, !torch.float, !torch.int) -> !torch.vtensor<[4,?,1],f32>
"torch.bind_symbolic_shape"(%19168, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
%19169 = "torch.aten.rsqrt"(%19168) : (!torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,1],f32>
"torch.bind_symbolic_shape"(%19169, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
%19170 = "torch.aten.mul.Tensor"(%19158, %19169) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,4096],f32>
"torch.bind_symbolic_shape"(%19170, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
%19171 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
%19172 = "torch.prims.convert_element_type"(%19170, %19171) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
"torch.bind_symbolic_shape"(%19172, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
%19173 = "torch.aten.mul.Tensor"(%17215, %19172) : (!torch.vtensor<[4096],bf16>, !torch.vtensor<[4,?,4096],bf16>) -> !torch.vtensor<[4,?,4096],bf16>
"torch.bind_symbolic_shape"(%19173, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
%19174 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
%19175 = "torch.prims.convert_element_type"(%19173, %19174) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
"torch.bind_symbolic_shape"(%19175, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
%19176 = "torch.aten.div.Tensor"(%19175, %17217) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
"torch.bind_symbolic_shape"(%19176, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
%19177 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
%19178 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
%19179 = "torch.aten.clamp"(%19176, %19177, %19178) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
"torch.bind_symbolic_shape"(%19179, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
%19180 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
%19181 = "torch.prims.convert_element_type"(%19179, %19180) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%19181, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
%19182 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%19183 = "torch.aten.unsqueeze"(%17219, %19182) : (!torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>
%19184 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%19185 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
%19186 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
%19187 = "torch.prim.ListConstruct"(%19184, %19185, %19186) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%19188 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
%19189 = "torch.aten.expand"(%19183, %19187, %19188) : (!torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ>
%19190 = "torch_c.to_builtin_tensor"(%19181) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
%19191 = "torch_c.to_builtin_tensor"(%19189) : (!torch.vtensor<[4,14336,4096],f8E4M3FNUZ>) -> tensor<4x14336x4096xf8E4M3FNUZ>
%19192 = "util.call"(%19190, %19191) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x?x14336xf32>
%19193 = "torch_c.from_builtin_tensor"(%19192) : (tensor<4x?x14336xf32>) -> !torch.vtensor<[4,?,14336],f32>
"torch.bind_symbolic_shape"(%19193, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> ()
%19194 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
%19195 = "torch.prims.convert_element_type"(%19193, %19194) : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> !torch.vtensor<[4,?,14336],bf16>
"torch.bind_symbolic_shape"(%19195, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
%19196 = "torch.aten.silu"(%19195) : (!torch.vtensor<[4,?,14336],bf16>) -> !torch.vtensor<[4,?,14336],bf16>
"torch.bind_symbolic_shape"(%19196, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
%19197 = "torch.aten.div.Tensor"(%19175, %17221) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
"torch.bind_symbolic_shape"(%19197, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
%19198 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
%19199 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
%19200 = "torch.aten.clamp"(%19197, %19198, %19199) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
"torch.bind_symbolic_shape"(%19200, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
%19201 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
%19202 = "torch.prims.convert_element_type"(%19200, %19201) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%19202, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
%19203 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%19204 = "torch.aten.unsqueeze"(%17223, %19203) : (!torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>
%19205 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%19206 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
%19207 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
%19208 = "torch.prim.ListConstruct"(%19205, %19206, %19207) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%19209 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
%19210 = "torch.aten.expand"(%19204, %19208, %19209) : (!torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ>
%19211 = "torch_c.to_builtin_tensor"(%19202) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
%19212 = "torch_c.to_builtin_tensor"(%19210) : (!torch.vtensor<[4,14336,4096],f8E4M3FNUZ>) -> tensor<4x14336x4096xf8E4M3FNUZ>
%19213 = "util.call"(%19211, %19212) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x?x14336xf32>
%19214 = "torch_c.from_builtin_tensor"(%19213) : (tensor<4x?x14336xf32>) -> !torch.vtensor<[4,?,14336],f32>
"torch.bind_symbolic_shape"(%19214, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> ()
%19215 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
%19216 = "torch.prims.convert_element_type"(%19214, %19215) : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> !torch.vtensor<[4,?,14336],bf16>
"torch.bind_symbolic_shape"(%19216, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
%19217 = "torch.aten.mul.Tensor"(%19196, %19216) : (!torch.vtensor<[4,?,14336],bf16>, !torch.vtensor<[4,?,14336],bf16>) -> !torch.vtensor<[4,?,14336],bf16>
"torch.bind_symbolic_shape"(%19217, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
%19218 = "torch.aten.div.Tensor"(%19217, %17225) : (!torch.vtensor<[4,?,14336],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,14336],bf16>
"torch.bind_symbolic_shape"(%19218, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
%19219 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
%19220 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
%19221 = "torch.aten.clamp"(%19218, %19219, %19220) : (!torch.vtensor<[4,?,14336],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,14336],bf16>
"torch.bind_symbolic_shape"(%19221, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
%19222 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
%19223 = "torch.prims.convert_element_type"(%19221, %19222) : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> !torch.vtensor<[4,?,14336],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%19223, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],f8E4M3FNUZ>, !torch.int) -> ()
%19224 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%19225 = "torch.aten.unsqueeze"(%17227, %19224) : (!torch.vtensor<[4096,14336],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,14336],f8E4M3FNUZ>
%19226 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%19227 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
%19228 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
%19229 = "torch.prim.ListConstruct"(%19226, %19227, %19228) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%19230 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
%19231 = "torch.aten.expand"(%19225, %19229, %19230) : (!torch.vtensor<[1,4096,14336],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,14336],f8E4M3FNUZ>
%19232 = "torch_c.to_builtin_tensor"(%19223) : (!torch.vtensor<[4,?,14336],f8E4M3FNUZ>) -> tensor<4x?x14336xf8E4M3FNUZ>
%19233 = "torch_c.to_builtin_tensor"(%19231) : (!torch.vtensor<[4,4096,14336],f8E4M3FNUZ>) -> tensor<4x4096x14336xf8E4M3FNUZ>
%19234 = "util.call"(%19232, %19233) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx14336xf8E4M3FNUZ_R4x4096x14336xf8E4M3FNUZ}> : (tensor<4x?x14336xf8E4M3FNUZ>, tensor<4x4096x14336xf8E4M3FNUZ>) -> tensor<4x?x4096xf32>
%19235 = "torch_c.from_builtin_tensor"(%19234) : (tensor<4x?x4096xf32>) -> !torch.vtensor<[4,?,4096],f32>
"torch.bind_symbolic_shape"(%19235, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
%19236 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
%19237 = "torch.prims.convert_element_type"(%19235, %19236) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
"torch.bind_symbolic_shape"(%19237, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
%19238 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%19239 = "torch.aten.add.Tensor"(%19156, %19237, %19238) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
"torch.bind_symbolic_shape"(%19239, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
%19240 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
%19241 = "torch.prims.convert_element_type"(%19239, %19240) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
"torch.bind_symbolic_shape"(%19241, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
%19242 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
%19243 = "torch.aten.pow.Tensor_Scalar"(%19241, %19242) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
"torch.bind_symbolic_shape"(%19243, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
%19244 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
%19245 = "torch.prim.ListConstruct"(%19244) : (!torch.int) -> !torch.list<int>
%19246 = "torch.constant.bool"() <{value = true}> : () -> !torch.bool
%19247 = "torch.constant.none"() : () -> !torch.none
%19248 = "torch.aten.mean.dim"(%19243, %19245, %19246, %19247) : (!torch.vtensor<[4,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none) -> !torch.vtensor<[4,?,1],f32>
"torch.bind_symbolic_shape"(%19248, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
%19249 = "torch.constant.float"() <{value = 1.000000e-05 : f64}> : () -> !torch.float
%19250 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%19251 = "torch.aten.add.Scalar"(%19248, %19249, %19250) : (!torch.vtensor<[4,?,1],f32>, !torch.float, !torch.int) -> !torch.vtensor<[4,?,1],f32>
"torch.bind_symbolic_shape"(%19251, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
%19252 = "torch.aten.rsqrt"(%19251) : (!torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,1],f32>
"torch.bind_symbolic_shape"(%19252, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
%19253 = "torch.aten.mul.Tensor"(%19241, %19252) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,4096],f32>
"torch.bind_symbolic_shape"(%19253, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
%19254 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
%19255 = "torch.prims.convert_element_type"(%19253, %19254) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
"torch.bind_symbolic_shape"(%19255, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
%19256 = "torch.aten.mul.Tensor"(%17229, %19255) : (!torch.vtensor<[4096],bf16>, !torch.vtensor<[4,?,4096],bf16>) -> !torch.vtensor<[4,?,4096],bf16>
"torch.bind_symbolic_shape"(%19256, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
%19257 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
%19258 = "torch.prims.convert_element_type"(%19256, %19257) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
"torch.bind_symbolic_shape"(%19258, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
%19259 = "torch.aten.div.Tensor"(%19258, %17231) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
"torch.bind_symbolic_shape"(%19259, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
%19260 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
%19261 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
%19262 = "torch.aten.clamp"(%19259, %19260, %19261) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
"torch.bind_symbolic_shape"(%19262, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
%19263 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
%19264 = "torch.prims.convert_element_type"(%19262, %19263) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%19264, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
%19265 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%19266 = "torch.aten.unsqueeze"(%17233, %19265) : (!torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>
%19267 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%19268 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
%19269 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
%19270 = "torch.prim.ListConstruct"(%19267, %19268, %19269) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%19271 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
%19272 = "torch.aten.expand"(%19266, %19270, %19271) : (!torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ>
%19273 = "torch_c.to_builtin_tensor"(%19264) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
%19274 = "torch_c.to_builtin_tensor"(%19272) : (!torch.vtensor<[4,4096,4096],f8E4M3FNUZ>) -> tensor<4x4096x4096xf8E4M3FNUZ>
%19275 = "util.call"(%19273, %19274) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x?x4096xf32>
%19276 = "torch_c.from_builtin_tensor"(%19275) : (tensor<4x?x4096xf32>) -> !torch.vtensor<[4,?,4096],f32>
"torch.bind_symbolic_shape"(%19276, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
%19277 = "torch.aten.div.Tensor"(%19276, %17235) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],f32>
"torch.bind_symbolic_shape"(%19277, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
%19278 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
%19279 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
%19280 = "torch.aten.clamp"(%19277, %19278, %19279) : (!torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],f32>
"torch.bind_symbolic_shape"(%19280, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
%19281 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
%19282 = "torch.prims.convert_element_type"(%19280, %19281) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%19282, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
%19283 = "torch.aten.div.Tensor"(%19258, %17237) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
"torch.bind_symbolic_shape"(%19283, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
%19284 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
%19285 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
%19286 = "torch.aten.clamp"(%19283, %19284, %19285) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
"torch.bind_symbolic_shape"(%19286, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
%19287 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
%19288 = "torch.prims.convert_element_type"(%19286, %19287) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%19288, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
%19289 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%19290 = "torch.aten.unsqueeze"(%17239, %19289) : (!torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>
%19291 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%19292 = "torch.constant.int"() <{value = 1024 : i64}> : () -> !torch.int
%19293 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
%19294 = "torch.prim.ListConstruct"(%19291, %19292, %19293) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%19295 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
%19296 = "torch.aten.expand"(%19290, %19294, %19295) : (!torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ>
%19297 = "torch_c.to_builtin_tensor"(%19288) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
%19298 = "torch_c.to_builtin_tensor"(%19296) : (!torch.vtensor<[4,1024,4096],f8E4M3FNUZ>) -> tensor<4x1024x4096xf8E4M3FNUZ>
%19299 = "util.call"(%19297, %19298) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x?x1024xf32>
%19300 = "torch_c.from_builtin_tensor"(%19299) : (tensor<4x?x1024xf32>) -> !torch.vtensor<[4,?,1024],f32>
"torch.bind_symbolic_shape"(%19300, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
%19301 = "torch.aten.div.Tensor"(%19300, %17241) : (!torch.vtensor<[4,?,1024],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,1024],f32>
"torch.bind_symbolic_shape"(%19301, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
%19302 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
%19303 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
%19304 = "torch.aten.clamp"(%19301, %19302, %19303) : (!torch.vtensor<[4,?,1024],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,1024],f32>
"torch.bind_symbolic_shape"(%19304, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
%19305 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
%19306 = "torch.prims.convert_element_type"(%19304, %19305) : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> !torch.vtensor<[4,?,1024],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%19306, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.int) -> ()
%19307 = "torch.aten.div.Tensor"(%19258, %17243) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
"torch.bind_symbolic_shape"(%19307, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
%19308 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
%19309 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
%19310 = "torch.aten.clamp"(%19307, %19308, %19309) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
"torch.bind_symbolic_shape"(%19310, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
%19311 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
%19312 = "torch.prims.convert_element_type"(%19310, %19311) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%19312, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
%19313 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%19314 = "torch.aten.unsqueeze"(%17245, %19313) : (!torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>
%19315 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%19316 = "torch.constant.int"() <{value = 1024 : i64}> : () -> !torch.int
%19317 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
%19318 = "torch.prim.ListConstruct"(%19315, %19316, %19317) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%19319 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
%19320 = "torch.aten.expand"(%19314, %19318, %19319) : (!torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ>
%19321 = "torch_c.to_builtin_tensor"(%19312) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
%19322 = "torch_c.to_builtin_tensor"(%19320) : (!torch.vtensor<[4,1024,4096],f8E4M3FNUZ>) -> tensor<4x1024x4096xf8E4M3FNUZ>
%19323 = "util.call"(%19321, %19322) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x?x1024xf32>
%19324 = "torch_c.from_builtin_tensor"(%19323) : (tensor<4x?x1024xf32>) -> !torch.vtensor<[4,?,1024],f32>
"torch.bind_symbolic_shape"(%19324, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
%19325 = "torch.aten.div.Tensor"(%19324, %17247) : (!torch.vtensor<[4,?,1024],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,1024],f32>
"torch.bind_symbolic_shape"(%19325, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
%19326 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
%19327 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
%19328 = "torch.aten.clamp"(%19325, %19326, %19327) : (!torch.vtensor<[4,?,1024],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,1024],f32>
"torch.bind_symbolic_shape"(%19328, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
%19329 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
%19330 = "torch.prims.convert_element_type"(%19328, %19329) : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> !torch.vtensor<[4,?,1024],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%19330, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.int) -> ()
%19331 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%19332 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
%19333 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
%19334 = "torch.prim.ListConstruct"(%19331, %18481, %19332, %19333) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%19335 = "torch.aten.view"(%19282, %19334) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%19335, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
%19336 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%19337 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
%19338 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
%19339 = "torch.prim.ListConstruct"(%19336, %18481, %19337, %19338) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%19340 = "torch.aten.view"(%19306, %19339) : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%19340, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
%19341 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%19342 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
%19343 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
%19344 = "torch.prim.ListConstruct"(%19341, %18481, %19342, %19343) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%19345 = "torch.aten.view"(%19330, %19344) : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%19345, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
%19346 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
%19347 = "torch.constant.none"() : () -> !torch.none
%19348 = "torch.constant.none"() : () -> !torch.none
%19349 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
%19350 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
%19351 = "torch.aten.arange"(%19346, %19347, %19348, %19349, %19350) : (!torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[131072],si64>
%19352 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%19353 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
%19354 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
%19355 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%19356 = "torch.constant.none"() : () -> !torch.none
%19357 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
%19358 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
%19359 = "torch.aten.arange.start_step"(%19352, %19353, %19354, %19355, %19356, %19357, %19358) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[64],si64>
%19360 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
%19361 = "torch.prims.convert_element_type"(%19359, %19360) : (!torch.vtensor<[64],si64>, !torch.int) -> !torch.vtensor<[64],f32>
%19362 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
%19363 = "torch.aten.div.Scalar"(%19361, %19362) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
%19364 = "torch.constant.float"() <{value = 5.000000e+05 : f64}> : () -> !torch.float
%19365 = "torch.aten.pow.Scalar"(%19364, %19363) : (!torch.float, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
%19366 = "torch.aten.reciprocal"(%19365) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
%19367 = "torch.constant.float"() <{value = 1.000000e+00 : f64}> : () -> !torch.float
%19368 = "torch.aten.mul.Scalar"(%19366, %19367) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
%19369 = "torch.aten.reciprocal"(%19368) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
%19370 = "torch.constant.float"() <{value = 6.2831853071795862 : f64}> : () -> !torch.float
%19371 = "torch.aten.mul.Scalar"(%19369, %19370) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
%19372 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
%19373 = "torch.aten.gt.Scalar"(%19371, %19372) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
%19374 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
%19375 = "torch.aten.div.Scalar"(%19368, %19374) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
%19376 = "torch.aten.where.self"(%19373, %19375, %19368) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
%19377 = "torch.aten.reciprocal"(%19371) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
%19378 = "torch.constant.int"() <{value = 8192 : i64}> : () -> !torch.int
%19379 = "torch.aten.mul.Scalar"(%19377, %19378) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
%19380 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%19381 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%19382 = "torch.aten.sub.Scalar"(%19379, %19380, %19381) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
%19383 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
%19384 = "torch.aten.div.Scalar"(%19382, %19383) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
%19385 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%19386 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%19387 = "torch.aten.rsub.Scalar"(%19384, %19385, %19386) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
%19388 = "torch.aten.mul.Tensor"(%19387, %19376) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
%19389 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
%19390 = "torch.aten.div.Scalar"(%19388, %19389) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
%19391 = "torch.aten.mul.Tensor"(%19384, %19376) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
%19392 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%19393 = "torch.aten.add.Tensor"(%19390, %19391, %19392) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
%19394 = "torch.constant.float"() <{value = 2.048000e+03 : f64}> : () -> !torch.float
%19395 = "torch.aten.lt.Scalar"(%19371, %19394) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
%19396 = "torch.aten.bitwise_not"(%19395) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
%19397 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
%19398 = "torch.aten.gt.Scalar"(%19371, %19397) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
%19399 = "torch.aten.bitwise_not"(%19398) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
%19400 = "torch.aten.mul.Tensor"(%19396, %19399) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
%19401 = "torch.aten.where.self"(%19400, %19393, %19376) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
%19402 = "torch.prim.ListConstruct"(%19401, %19401) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.list<vtensor>
%19403 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
%19404 = "torch.aten.cat"(%19402, %19403) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[128],f32>
%19405 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
%19406 = "torch.prims.convert_element_type"(%19351, %19405) : (!torch.vtensor<[131072],si64>, !torch.int) -> !torch.vtensor<[131072],f32>
%19407 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
%19408 = "torch.prims.convert_element_type"(%19404, %19407) : (!torch.vtensor<[128],f32>, !torch.int) -> !torch.vtensor<[128],f32>
%19409 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
%19410 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%19411 = "torch.prim.ListConstruct"(%19409, %19410) : (!torch.int, !torch.int) -> !torch.list<int>
%19412 = "torch.aten.view"(%19406, %19411) : (!torch.vtensor<[131072],f32>, !torch.list<int>) -> !torch.vtensor<[131072,1],f32>
%19413 = "torch.aten.mul.Tensor"(%19412, %19408) : (!torch.vtensor<[131072,1],f32>, !torch.vtensor<[128],f32>) -> !torch.vtensor<[131072,128],f32>
%19414 = "torch.aten.cos"(%19413) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
%19415 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
%19416 = "torch.prims.convert_element_type"(%19414, %19415) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
%19417 = "torch.aten.sin"(%19413) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
%19418 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
%19419 = "torch.prims.convert_element_type"(%19417, %19418) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
%19420 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%19421 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%19422 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%19423 = "torch.aten.slice.Tensor"(%19416, %19420, %19421, %18481, %19422) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
"torch.bind_symbolic_shape"(%19423, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
%19424 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%19425 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%19426 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
%19427 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%19428 = "torch.aten.slice.Tensor"(%19423, %19424, %19425, %19426, %19427) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
"torch.bind_symbolic_shape"(%19428, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
%19429 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%19430 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%19431 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%19432 = "torch.aten.slice.Tensor"(%19419, %19429, %19430, %18481, %19431) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
"torch.bind_symbolic_shape"(%19432, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
%19433 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%19434 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%19435 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
%19436 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%19437 = "torch.aten.slice.Tensor"(%19432, %19433, %19434, %19435, %19436) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
"torch.bind_symbolic_shape"(%19437, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
%19438 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%19439 = "torch.aten.unsqueeze"(%19428, %19438) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
"torch.bind_symbolic_shape"(%19439, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
%19440 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%19441 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%19442 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
%19443 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%19444 = "torch.aten.slice.Tensor"(%19439, %19440, %19441, %19442, %19443) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
"torch.bind_symbolic_shape"(%19444, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
%19445 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
%19446 = "torch.aten.unsqueeze"(%19444, %19445) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
"torch.bind_symbolic_shape"(%19446, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
%19447 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
%19448 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%19449 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
%19450 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%19451 = "torch.aten.slice.Tensor"(%19446, %19447, %19448, %19449, %19450) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
"torch.bind_symbolic_shape"(%19451, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
%19452 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%19453 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%19454 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%19455 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%19456 = "torch.prim.ListConstruct"(%19452, %19453, %19454, %19455) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%19457 = "torch.aten.repeat"(%19451, %19456) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
"torch.bind_symbolic_shape"(%19457, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
%19458 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%19459 = "torch.aten.unsqueeze"(%19437, %19458) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
"torch.bind_symbolic_shape"(%19459, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
%19460 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%19461 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%19462 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
%19463 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%19464 = "torch.aten.slice.Tensor"(%19459, %19460, %19461, %19462, %19463) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
"torch.bind_symbolic_shape"(%19464, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
%19465 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
%19466 = "torch.aten.unsqueeze"(%19464, %19465) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
"torch.bind_symbolic_shape"(%19466, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
%19467 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
%19468 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%19469 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
%19470 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%19471 = "torch.aten.slice.Tensor"(%19466, %19467, %19468, %19469, %19470) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
"torch.bind_symbolic_shape"(%19471, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
%19472 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%19473 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%19474 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%19475 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%19476 = "torch.prim.ListConstruct"(%19472, %19473, %19474, %19475) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%19477 = "torch.aten.repeat"(%19471, %19476) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
"torch.bind_symbolic_shape"(%19477, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
%19478 = "torch.aten.mul.Tensor"(%19335, %19457) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%19478, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
%19479 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
%19480 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%19481 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
%19482 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%19483 = "torch.aten.slice.Tensor"(%19335, %19479, %19480, %19481, %19482) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%19483, %18474) <{shape_expressions = #map18}> : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.int) -> ()
%19484 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
%19485 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
%19486 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
%19487 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%19488 = "torch.aten.slice.Tensor"(%19335, %19484, %19485, %19486, %19487) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%19488, %18474) <{shape_expressions = #map18}> : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.int) -> ()
%19489 = "torch.aten.neg"(%19488) : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>) -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%19489, %18474) <{shape_expressions = #map18}> : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.int) -> ()
%19490 = "torch.prim.ListConstruct"(%19489, %19483) : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>) -> !torch.list<vtensor>
%19491 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
%19492 = "torch.aten.cat"(%19490, %19491) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%19492, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
%19493 = "torch.aten.mul.Tensor"(%19492, %19477) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%19493, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
%19494 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%19495 = "torch.aten.add.Tensor"(%19478, %19493, %19494) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%19495, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
%19496 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
%19497 = "torch.constant.none"() : () -> !torch.none
%19498 = "torch.constant.none"() : () -> !torch.none
%19499 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
%19500 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
%19501 = "torch.aten.arange"(%19496, %19497, %19498, %19499, %19500) : (!torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[131072],si64>
%19502 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%19503 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
%19504 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
%19505 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%19506 = "torch.constant.none"() : () -> !torch.none
%19507 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
%19508 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
%19509 = "torch.aten.arange.start_step"(%19502, %19503, %19504, %19505, %19506, %19507, %19508) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[64],si64>
%19510 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
%19511 = "torch.prims.convert_element_type"(%19509, %19510) : (!torch.vtensor<[64],si64>, !torch.int) -> !torch.vtensor<[64],f32>
%19512 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
%19513 = "torch.aten.div.Scalar"(%19511, %19512) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
%19514 = "torch.constant.float"() <{value = 5.000000e+05 : f64}> : () -> !torch.float
%19515 = "torch.aten.pow.Scalar"(%19514, %19513) : (!torch.float, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
%19516 = "torch.aten.reciprocal"(%19515) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
%19517 = "torch.constant.float"() <{value = 1.000000e+00 : f64}> : () -> !torch.float
%19518 = "torch.aten.mul.Scalar"(%19516, %19517) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
%19519 = "torch.aten.reciprocal"(%19518) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
%19520 = "torch.constant.float"() <{value = 6.2831853071795862 : f64}> : () -> !torch.float
%19521 = "torch.aten.mul.Scalar"(%19519, %19520) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
%19522 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
%19523 = "torch.aten.gt.Scalar"(%19521, %19522) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
%19524 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
%19525 = "torch.aten.div.Scalar"(%19518, %19524) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
%19526 = "torch.aten.where.self"(%19523, %19525, %19518) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
%19527 = "torch.aten.reciprocal"(%19521) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
%19528 = "torch.constant.int"() <{value = 8192 : i64}> : () -> !torch.int
%19529 = "torch.aten.mul.Scalar"(%19527, %19528) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
%19530 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%19531 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%19532 = "torch.aten.sub.Scalar"(%19529, %19530, %19531) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
%19533 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
%19534 = "torch.aten.div.Scalar"(%19532, %19533) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
%19535 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%19536 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%19537 = "torch.aten.rsub.Scalar"(%19534, %19535, %19536) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
%19538 = "torch.aten.mul.Tensor"(%19537, %19526) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
%19539 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
%19540 = "torch.aten.div.Scalar"(%19538, %19539) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
%19541 = "torch.aten.mul.Tensor"(%19534, %19526) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
%19542 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%19543 = "torch.aten.add.Tensor"(%19540, %19541, %19542) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
%19544 = "torch.constant.float"() <{value = 2.048000e+03 : f64}> : () -> !torch.float
%19545 = "torch.aten.lt.Scalar"(%19521, %19544) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
%19546 = "torch.aten.bitwise_not"(%19545) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
%19547 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
%19548 = "torch.aten.gt.Scalar"(%19521, %19547) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
%19549 = "torch.aten.bitwise_not"(%19548) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
%19550 = "torch.aten.mul.Tensor"(%19546, %19549) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
%19551 = "torch.aten.where.self"(%19550, %19543, %19526) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
%19552 = "torch.prim.ListConstruct"(%19551, %19551) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.list<vtensor>
%19553 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
%19554 = "torch.aten.cat"(%19552, %19553) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[128],f32>
%19555 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
%19556 = "torch.prims.convert_element_type"(%19501, %19555) : (!torch.vtensor<[131072],si64>, !torch.int) -> !torch.vtensor<[131072],f32>
%19557 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
%19558 = "torch.prims.convert_element_type"(%19554, %19557) : (!torch.vtensor<[128],f32>, !torch.int) -> !torch.vtensor<[128],f32>
%19559 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
%19560 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%19561 = "torch.prim.ListConstruct"(%19559, %19560) : (!torch.int, !torch.int) -> !torch.list<int>
%19562 = "torch.aten.view"(%19556, %19561) : (!torch.vtensor<[131072],f32>, !torch.list<int>) -> !torch.vtensor<[131072,1],f32>
%19563 = "torch.aten.mul.Tensor"(%19562, %19558) : (!torch.vtensor<[131072,1],f32>, !torch.vtensor<[128],f32>) -> !torch.vtensor<[131072,128],f32>
%19564 = "torch.aten.cos"(%19563) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
%19565 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
%19566 = "torch.prims.convert_element_type"(%19564, %19565) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
%19567 = "torch.aten.sin"(%19563) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
%19568 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
%19569 = "torch.prims.convert_element_type"(%19567, %19568) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
%19570 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%19571 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%19572 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%19573 = "torch.aten.slice.Tensor"(%19566, %19570, %19571, %18481, %19572) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
"torch.bind_symbolic_shape"(%19573, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
%19574 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%19575 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%19576 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
%19577 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%19578 = "torch.aten.slice.Tensor"(%19573, %19574, %19575, %19576, %19577) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
"torch.bind_symbolic_shape"(%19578, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
%19579 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%19580 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%19581 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%19582 = "torch.aten.slice.Tensor"(%19569, %19579, %19580, %18481, %19581) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
"torch.bind_symbolic_shape"(%19582, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
%19583 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%19584 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%19585 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
%19586 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%19587 = "torch.aten.slice.Tensor"(%19582, %19583, %19584, %19585, %19586) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
"torch.bind_symbolic_shape"(%19587, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
%19588 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%19589 = "torch.aten.unsqueeze"(%19578, %19588) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
"torch.bind_symbolic_shape"(%19589, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
%19590 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%19591 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%19592 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
%19593 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%19594 = "torch.aten.slice.Tensor"(%19589, %19590, %19591, %19592, %19593) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
"torch.bind_symbolic_shape"(%19594, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
%19595 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
%19596 = "torch.aten.unsqueeze"(%19594, %19595) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
"torch.bind_symbolic_shape"(%19596, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
%19597 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
%19598 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%19599 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
%19600 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%19601 = "torch.aten.slice.Tensor"(%19596, %19597, %19598, %19599, %19600) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
"torch.bind_symbolic_shape"(%19601, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
%19602 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%19603 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%19604 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%19605 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%19606 = "torch.prim.ListConstruct"(%19602, %19603, %19604, %19605) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%19607 = "torch.aten.repeat"(%19601, %19606) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
"torch.bind_symbolic_shape"(%19607, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
%19608 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%19609 = "torch.aten.unsqueeze"(%19587, %19608) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
"torch.bind_symbolic_shape"(%19609, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
%19610 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%19611 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%19612 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
%19613 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%19614 = "torch.aten.slice.Tensor"(%19609, %19610, %19611, %19612, %19613) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
"torch.bind_symbolic_shape"(%19614, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
%19615 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
%19616 = "torch.aten.unsqueeze"(%19614, %19615) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
"torch.bind_symbolic_shape"(%19616, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
%19617 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
%19618 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%19619 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
%19620 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%19621 = "torch.aten.slice.Tensor"(%19616, %19617, %19618, %19619, %19620) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
"torch.bind_symbolic_shape"(%19621, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
%19622 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%19623 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%19624 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%19625 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%19626 = "torch.prim.ListConstruct"(%19622, %19623, %19624, %19625) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%19627 = "torch.aten.repeat"(%19621, %19626) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
"torch.bind_symbolic_shape"(%19627, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
%19628 = "torch.aten.mul.Tensor"(%19340, %19607) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%19628, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
%19629 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
%19630 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%19631 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
%19632 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%19633 = "torch.aten.slice.Tensor"(%19340, %19629, %19630, %19631, %19632) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%19633, %18474) <{shape_expressions = #map19}> : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.int) -> ()
%19634 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
%19635 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
%19636 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
%19637 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%19638 = "torch.aten.slice.Tensor"(%19340, %19634, %19635, %19636, %19637) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%19638, %18474) <{shape_expressions = #map19}> : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.int) -> ()
%19639 = "torch.aten.neg"(%19638) : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>) -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%19639, %18474) <{shape_expressions = #map19}> : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.int) -> ()
%19640 = "torch.prim.ListConstruct"(%19639, %19633) : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>) -> !torch.list<vtensor>
%19641 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
%19642 = "torch.aten.cat"(%19640, %19641) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%19642, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
%19643 = "torch.aten.mul.Tensor"(%19642, %19627) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%19643, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
%19644 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%19645 = "torch.aten.add.Tensor"(%19628, %19643, %19644) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%19645, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
%19646 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
%19647 = "torch.aten.mul.Scalar"(%arg69, %19646) : (!torch.vtensor<[4,?],si64>, !torch.int) -> !torch.vtensor<[4,?],si64>
"torch.bind_symbolic_shape"(%19647, %18474) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
%19648 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
%19649 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%19650 = "torch.aten.add.Scalar"(%19647, %19648, %19649) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
"torch.bind_symbolic_shape"(%19650, %18474) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
%19651 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%19652 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
%19653 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
%19654 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
%19655 = "torch.prim.ListConstruct"(%19651, %18477, %19652, %19653, %19654) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%19656 = "torch.aten.view"(%19645, %19655) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%19656, %18474) <{shape_expressions = #map22}> : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
%19657 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
%19658 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
%19659 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
%19660 = "torch.prim.ListConstruct"(%19011, %19657, %19658, %19659) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%19661 = "torch.aten.view"(%19656, %19660) : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%19661, %18474) <{shape_expressions = #map23}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
%19662 = "torch.prim.ListConstruct"(%19011) : (!torch.int) -> !torch.list<int>
%19663 = "torch.aten.view"(%19650, %19662) : (!torch.vtensor<[4,?],si64>, !torch.list<int>) -> !torch.vtensor<[?],si64>
"torch.bind_symbolic_shape"(%19663, %18474) <{shape_expressions = #map24}> : (!torch.vtensor<[?],si64>, !torch.int) -> ()
%19664 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
%19665 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
%19666 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
%19667 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
%19668 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
%19669 = "torch.prim.ListConstruct"(%18479, %19664, %19665, %19666, %19667, %19668) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%19670 = "torch.aten.view"(%19072, %19669) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%19670, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
%19671 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
%19672 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
%19673 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
%19674 = "torch.prim.ListConstruct"(%18993, %19671, %19672, %19673) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%19675 = "torch.aten.view"(%19670, %19674) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%19675, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
%19676 = "torch.prim.ListConstruct"(%19663) : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>>
%19677 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
%19678 = "torch.aten.index_put"(%19675, %19676, %19661, %19677) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.bool) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%19678, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
%19679 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
%19680 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
%19681 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
%19682 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
%19683 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
%19684 = "torch.prim.ListConstruct"(%18479, %19679, %19680, %19681, %19682, %19683) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%19685 = "torch.aten.view"(%19678, %19684) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%19685, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
%19686 = "torch.constant.int"() <{value = 2097152 : i64}> : () -> !torch.int
%19687 = "torch.prim.ListConstruct"(%18479, %19686) : (!torch.int, !torch.int) -> !torch.list<int>
%19688 = "torch.aten.view"(%19685, %19687) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,2097152],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%19688, %18475) <{shape_expressions = #map2}> : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> ()
%19689 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
%19690 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
%19691 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
%19692 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
%19693 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
%19694 = "torch.prim.ListConstruct"(%18479, %19689, %19690, %19691, %19692, %19693) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%19695 = "torch.aten.view"(%19688, %19694) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%19695, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
%19696 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
%19697 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
%19698 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
%19699 = "torch.prim.ListConstruct"(%18993, %19696, %19697, %19698) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%19700 = "torch.aten.view"(%19695, %19699) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%19700, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
%19701 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%19702 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
%19703 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
%19704 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
%19705 = "torch.prim.ListConstruct"(%19701, %18477, %19702, %19703, %19704) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%19706 = "torch.aten.view"(%19345, %19705) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%19706, %18474) <{shape_expressions = #map22}> : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
%19707 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
%19708 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
%19709 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
%19710 = "torch.prim.ListConstruct"(%19011, %19707, %19708, %19709) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%19711 = "torch.aten.view"(%19706, %19710) : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%19711, %18474) <{shape_expressions = #map23}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
%19712 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%19713 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%19714 = "torch.aten.add.Scalar"(%19650, %19712, %19713) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
"torch.bind_symbolic_shape"(%19714, %18474) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
%19715 = "torch.prim.ListConstruct"(%19011) : (!torch.int) -> !torch.list<int>
%19716 = "torch.aten.view"(%19714, %19715) : (!torch.vtensor<[4,?],si64>, !torch.list<int>) -> !torch.vtensor<[?],si64>
"torch.bind_symbolic_shape"(%19716, %18474) <{shape_expressions = #map24}> : (!torch.vtensor<[?],si64>, !torch.int) -> ()
%19717 = "torch.prim.ListConstruct"(%19716) : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>>
%19718 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
%19719 = "torch.aten.index_put"(%19700, %19717, %19711, %19718) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.bool) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%19719, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
%19720 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
%19721 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
%19722 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
%19723 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
%19724 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
%19725 = "torch.prim.ListConstruct"(%18479, %19720, %19721, %19722, %19723, %19724) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%19726 = "torch.aten.view"(%19719, %19725) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%19726, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
%19727 = "torch.constant.int"() <{value = 2097152 : i64}> : () -> !torch.int
%19728 = "torch.prim.ListConstruct"(%18479, %19727) : (!torch.int, !torch.int) -> !torch.list<int>
%19729 = "torch.aten.view"(%19726, %19728) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,2097152],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%19729, %18475) <{shape_expressions = #map2}> : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> ()
%19730 = "torch.constant.int"() <{value = -2 : i64}> : () -> !torch.int
%19731 = "torch.aten.unsqueeze"(%19645, %19730) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%19731, %18474) <{shape_expressions = #map25}> : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.int) -> ()
%19732 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%19733 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
%19734 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%19735 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
%19736 = "torch.prim.ListConstruct"(%19732, %18481, %19733, %19734, %19735) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%19737 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
%19738 = "torch.aten.expand"(%19731, %19736, %19737) : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%19738, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
%19739 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%19740 = "torch.aten.clone"(%19738, %19739) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%19740, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
%19741 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%19742 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
%19743 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
%19744 = "torch.prim.ListConstruct"(%19741, %18481, %19742, %19743) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%19745 = "torch.aten._unsafe_view"(%19740, %19744) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%19745, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
%19746 = "torch.constant.int"() <{value = -2 : i64}> : () -> !torch.int
%19747 = "torch.aten.unsqueeze"(%19345, %19746) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%19747, %18474) <{shape_expressions = #map25}> : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.int) -> ()
%19748 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%19749 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
%19750 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%19751 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
%19752 = "torch.prim.ListConstruct"(%19748, %18481, %19749, %19750, %19751) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%19753 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
%19754 = "torch.aten.expand"(%19747, %19752, %19753) : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%19754, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
%19755 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%19756 = "torch.aten.clone"(%19754, %19755) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%19756, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
%19757 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%19758 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
%19759 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
%19760 = "torch.prim.ListConstruct"(%19757, %18481, %19758, %19759) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%19761 = "torch.aten._unsafe_view"(%19756, %19760) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%19761, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
%19762 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%19763 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
%19764 = "torch.aten.transpose.int"(%19495, %19762, %19763) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%19764, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
%19765 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%19766 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
%19767 = "torch.aten.transpose.int"(%19745, %19765, %19766) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%19767, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
%19768 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%19769 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
%19770 = "torch.aten.transpose.int"(%19761, %19768, %19769) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%19770, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
%19771 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%19772 = "torch.aten.squeeze.dim"(%18570, %19771) : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,1,?,?],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%19772, %18474) <{shape_expressions = #map8}> : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> ()
%19773 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%19774 = "torch.aten.squeeze.dim"(%19772, %19773) : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,1,?,?],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%19774, %18474) <{shape_expressions = #map8}> : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> ()
%19775 = "torch_c.to_builtin_tensor"(%19764) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
%19776 = "torch_c.to_builtin_tensor"(%19767) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
%19777 = "torch_c.to_builtin_tensor"(%19770) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
%19778 = "torch_c.to_builtin_tensor"(%19774) : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>) -> tensor<4x1x?x?xf8E4M3FNUZ>
%19779 = "tensor.cast"(%19778) : (tensor<4x1x?x?xf8E4M3FNUZ>) -> tensor<?x?x?x?xf8E4M3FNUZ>
%19780 = "torch_c.to_builtin_tensor"(%17249) : (!torch.vtensor<[],f32>) -> tensor<f32>
%19781 = "util.call"(%19775, %19776, %19777, %19780, %19779) <{callee = @sharktank_masked_flash_attention_4_32_128_128_f8E4M3FNUZ_f32_f32}> : (tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<f32>, tensor<?x?x?x?xf8E4M3FNUZ>) -> tensor<4x32x?x128xf32>
%19782 = "torch_c.from_builtin_tensor"(%19781) : (tensor<4x32x?x128xf32>) -> !torch.vtensor<[4,32,?,128],f32>
"torch.bind_symbolic_shape"(%19782, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f32>, !torch.int) -> ()
%19783 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%19784 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
%19785 = "torch.aten.transpose.int"(%19782, %19783, %19784) : (!torch.vtensor<[4,32,?,128],f32>, !torch.int, !torch.int) -> !torch.vtensor<[4,?,32,128],f32>
"torch.bind_symbolic_shape"(%19785, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f32>, !torch.int) -> ()
%19786 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%19787 = "torch.aten.clone"(%19785, %19786) : (!torch.vtensor<[4,?,32,128],f32>, !torch.int) -> !torch.vtensor<[4,?,32,128],f32>
"torch.bind_symbolic_shape"(%19787, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f32>, !torch.int) -> ()
%19788 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%19789 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
%19790 = "torch.prim.ListConstruct"(%19788, %18481, %19789) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%19791 = "torch.aten._unsafe_view"(%19787, %19790) : (!torch.vtensor<[4,?,32,128],f32>, !torch.list<int>) -> !torch.vtensor<[4,?,4096],f32>
"torch.bind_symbolic_shape"(%19791, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
%19792 = "torch.aten.div.Tensor"(%19791, %17251) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],f32>
"torch.bind_symbolic_shape"(%19792, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
%19793 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
%19794 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
%19795 = "torch.aten.clamp"(%19792, %19793, %19794) : (!torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],f32>
"torch.bind_symbolic_shape"(%19795, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
%19796 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
%19797 = "torch.prims.convert_element_type"(%19795, %19796) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%19797, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
%19798 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%19799 = "torch.aten.unsqueeze"(%17253, %19798) : (!torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>
%19800 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%19801 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
%19802 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
%19803 = "torch.prim.ListConstruct"(%19800, %19801, %19802) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%19804 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
%19805 = "torch.aten.expand"(%19799, %19803, %19804) : (!torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ>
%19806 = "torch_c.to_builtin_tensor"(%19797) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
%19807 = "torch_c.to_builtin_tensor"(%19805) : (!torch.vtensor<[4,4096,4096],f8E4M3FNUZ>) -> tensor<4x4096x4096xf8E4M3FNUZ>
%19808 = "util.call"(%19806, %19807) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x?x4096xf32>
%19809 = "torch_c.from_builtin_tensor"(%19808) : (tensor<4x?x4096xf32>) -> !torch.vtensor<[4,?,4096],f32>
"torch.bind_symbolic_shape"(%19809, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
%19810 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
%19811 = "torch.prims.convert_element_type"(%19809, %19810) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
"torch.bind_symbolic_shape"(%19811, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
%19812 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%19813 = "torch.aten.add.Tensor"(%19239, %19811, %19812) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
"torch.bind_symbolic_shape"(%19813, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
%19814 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
%19815 = "torch.prims.convert_element_type"(%19813, %19814) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
"torch.bind_symbolic_shape"(%19815, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
%19816 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
%19817 = "torch.aten.pow.Tensor_Scalar"(%19815, %19816) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
"torch.bind_symbolic_shape"(%19817, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
%19818 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
%19819 = "torch.prim.ListConstruct"(%19818) : (!torch.int) -> !torch.list<int>
%19820 = "torch.constant.bool"() <{value = true}> : () -> !torch.bool
%19821 = "torch.constant.none"() : () -> !torch.none
%19822 = "torch.aten.mean.dim"(%19817, %19819, %19820, %19821) : (!torch.vtensor<[4,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none) -> !torch.vtensor<[4,?,1],f32>
"torch.bind_symbolic_shape"(%19822, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
%19823 = "torch.constant.float"() <{value = 1.000000e-05 : f64}> : () -> !torch.float
%19824 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%19825 = "torch.aten.add.Scalar"(%19822, %19823, %19824) : (!torch.vtensor<[4,?,1],f32>, !torch.float, !torch.int) -> !torch.vtensor<[4,?,1],f32>
"torch.bind_symbolic_shape"(%19825, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
%19826 = "torch.aten.rsqrt"(%19825) : (!torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,1],f32>
"torch.bind_symbolic_shape"(%19826, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
%19827 = "torch.aten.mul.Tensor"(%19815, %19826) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,4096],f32>
"torch.bind_symbolic_shape"(%19827, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
%19828 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
%19829 = "torch.prims.convert_element_type"(%19827, %19828) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
"torch.bind_symbolic_shape"(%19829, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
%19830 = "torch.aten.mul.Tensor"(%17255, %19829) : (!torch.vtensor<[4096],bf16>, !torch.vtensor<[4,?,4096],bf16>) -> !torch.vtensor<[4,?,4096],bf16>
"torch.bind_symbolic_shape"(%19830, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
%19831 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
%19832 = "torch.prims.convert_element_type"(%19830, %19831) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
"torch.bind_symbolic_shape"(%19832, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
%19833 = "torch.aten.div.Tensor"(%19832, %17257) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
"torch.bind_symbolic_shape"(%19833, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
%19834 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
%19835 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
%19836 = "torch.aten.clamp"(%19833, %19834, %19835) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
"torch.bind_symbolic_shape"(%19836, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
%19837 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
%19838 = "torch.prims.convert_element_type"(%19836, %19837) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%19838, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
%19839 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%19840 = "torch.aten.unsqueeze"(%17259, %19839) : (!torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>
%19841 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%19842 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
%19843 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
%19844 = "torch.prim.ListConstruct"(%19841, %19842, %19843) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%19845 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
%19846 = "torch.aten.expand"(%19840, %19844, %19845) : (!torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ>
%19847 = "torch_c.to_builtin_tensor"(%19838) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
%19848 = "torch_c.to_builtin_tensor"(%19846) : (!torch.vtensor<[4,14336,4096],f8E4M3FNUZ>) -> tensor<4x14336x4096xf8E4M3FNUZ>
%19849 = "util.call"(%19847, %19848) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x?x14336xf32>
%19850 = "torch_c.from_builtin_tensor"(%19849) : (tensor<4x?x14336xf32>) -> !torch.vtensor<[4,?,14336],f32>
"torch.bind_symbolic_shape"(%19850, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> ()
%19851 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
%19852 = "torch.prims.convert_element_type"(%19850, %19851) : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> !torch.vtensor<[4,?,14336],bf16>
"torch.bind_symbolic_shape"(%19852, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
%19853 = "torch.aten.silu"(%19852) : (!torch.vtensor<[4,?,14336],bf16>) -> !torch.vtensor<[4,?,14336],bf16>
"torch.bind_symbolic_shape"(%19853, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
%19854 = "torch.aten.div.Tensor"(%19832, %17261) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
"torch.bind_symbolic_shape"(%19854, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
%19855 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
%19856 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
%19857 = "torch.aten.clamp"(%19854, %19855, %19856) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
"torch.bind_symbolic_shape"(%19857, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
%19858 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
%19859 = "torch.prims.convert_element_type"(%19857, %19858) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%19859, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
%19860 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%19861 = "torch.aten.unsqueeze"(%17263, %19860) : (!torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>
%19862 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%19863 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
%19864 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
%19865 = "torch.prim.ListConstruct"(%19862, %19863, %19864) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%19866 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
%19867 = "torch.aten.expand"(%19861, %19865, %19866) : (!torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ>
%19868 = "torch_c.to_builtin_tensor"(%19859) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
%19869 = "torch_c.to_builtin_tensor"(%19867) : (!torch.vtensor<[4,14336,4096],f8E4M3FNUZ>) -> tensor<4x14336x4096xf8E4M3FNUZ>
%19870 = "util.call"(%19868, %19869) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x?x14336xf32>
%19871 = "torch_c.from_builtin_tensor"(%19870) : (tensor<4x?x14336xf32>) -> !torch.vtensor<[4,?,14336],f32>
"torch.bind_symbolic_shape"(%19871, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> ()
%19872 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
%19873 = "torch.prims.convert_element_type"(%19871, %19872) : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> !torch.vtensor<[4,?,14336],bf16>
"torch.bind_symbolic_shape"(%19873, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
%19874 = "torch.aten.mul.Tensor"(%19853, %19873) : (!torch.vtensor<[4,?,14336],bf16>, !torch.vtensor<[4,?,14336],bf16>) -> !torch.vtensor<[4,?,14336],bf16>
"torch.bind_symbolic_shape"(%19874, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
%19875 = "torch.aten.div.Tensor"(%19874, %17265) : (!torch.vtensor<[4,?,14336],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,14336],bf16>
"torch.bind_symbolic_shape"(%19875, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
%19876 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
%19877 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
%19878 = "torch.aten.clamp"(%19875, %19876, %19877) : (!torch.vtensor<[4,?,14336],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,14336],bf16>
"torch.bind_symbolic_shape"(%19878, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
%19879 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
%19880 = "torch.prims.convert_element_type"(%19878, %19879) : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> !torch.vtensor<[4,?,14336],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%19880, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],f8E4M3FNUZ>, !torch.int) -> ()
%19881 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%19882 = "torch.aten.unsqueeze"(%17267, %19881) : (!torch.vtensor<[4096,14336],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,14336],f8E4M3FNUZ>
%19883 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%19884 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
%19885 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
%19886 = "torch.prim.ListConstruct"(%19883, %19884, %19885) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%19887 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
%19888 = "torch.aten.expand"(%19882, %19886, %19887) : (!torch.vtensor<[1,4096,14336],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,14336],f8E4M3FNUZ>
%19889 = "torch_c.to_builtin_tensor"(%19880) : (!torch.vtensor<[4,?,14336],f8E4M3FNUZ>) -> tensor<4x?x14336xf8E4M3FNUZ>
%19890 = "torch_c.to_builtin_tensor"(%19888) : (!torch.vtensor<[4,4096,14336],f8E4M3FNUZ>) -> tensor<4x4096x14336xf8E4M3FNUZ>
%19891 = "util.call"(%19889, %19890) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx14336xf8E4M3FNUZ_R4x4096x14336xf8E4M3FNUZ}> : (tensor<4x?x14336xf8E4M3FNUZ>, tensor<4x4096x14336xf8E4M3FNUZ>) -> tensor<4x?x4096xf32>
%19892 = "torch_c.from_builtin_tensor"(%19891) : (tensor<4x?x4096xf32>) -> !torch.vtensor<[4,?,4096],f32>
"torch.bind_symbolic_shape"(%19892, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
%19893 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
%19894 = "torch.prims.convert_element_type"(%19892, %19893) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
"torch.bind_symbolic_shape"(%19894, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
%19895 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%19896 = "torch.aten.add.Tensor"(%19813, %19894, %19895) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
"torch.bind_symbolic_shape"(%19896, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
%19897 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
%19898 = "torch.prims.convert_element_type"(%19896, %19897) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
"torch.bind_symbolic_shape"(%19898, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
%19899 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
%19900 = "torch.aten.pow.Tensor_Scalar"(%19898, %19899) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
"torch.bind_symbolic_shape"(%19900, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
%19901 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
%19902 = "torch.prim.ListConstruct"(%19901) : (!torch.int) -> !torch.list<int>
%19903 = "torch.constant.bool"() <{value = true}> : () -> !torch.bool
%19904 = "torch.constant.none"() : () -> !torch.none
%19905 = "torch.aten.mean.dim"(%19900, %19902, %19903, %19904) : (!torch.vtensor<[4,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none) -> !torch.vtensor<[4,?,1],f32>
"torch.bind_symbolic_shape"(%19905, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
%19906 = "torch.constant.float"() <{value = 1.000000e-05 : f64}> : () -> !torch.float
%19907 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%19908 = "torch.aten.add.Scalar"(%19905, %19906, %19907) : (!torch.vtensor<[4,?,1],f32>, !torch.float, !torch.int) -> !torch.vtensor<[4,?,1],f32>
"torch.bind_symbolic_shape"(%19908, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
%19909 = "torch.aten.rsqrt"(%19908) : (!torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,1],f32>
"torch.bind_symbolic_shape"(%19909, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
%19910 = "torch.aten.mul.Tensor"(%19898, %19909) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,4096],f32>
"torch.bind_symbolic_shape"(%19910, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
%19911 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
%19912 = "torch.prims.convert_element_type"(%19910, %19911) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
"torch.bind_symbolic_shape"(%19912, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
%19913 = "torch.aten.mul.Tensor"(%17269, %19912) : (!torch.vtensor<[4096],bf16>, !torch.vtensor<[4,?,4096],bf16>) -> !torch.vtensor<[4,?,4096],bf16>
"torch.bind_symbolic_shape"(%19913, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
%19914 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
%19915 = "torch.prims.convert_element_type"(%19913, %19914) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
"torch.bind_symbolic_shape"(%19915, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
%19916 = "torch.aten.div.Tensor"(%19915, %17271) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
"torch.bind_symbolic_shape"(%19916, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
%19917 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
%19918 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
%19919 = "torch.aten.clamp"(%19916, %19917, %19918) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
"torch.bind_symbolic_shape"(%19919, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
%19920 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
%19921 = "torch.prims.convert_element_type"(%19919, %19920) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%19921, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
%19922 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%19923 = "torch.aten.unsqueeze"(%17273, %19922) : (!torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>
%19924 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%19925 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
%19926 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
%19927 = "torch.prim.ListConstruct"(%19924, %19925, %19926) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%19928 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
%19929 = "torch.aten.expand"(%19923, %19927, %19928) : (!torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ>
%19930 = "torch_c.to_builtin_tensor"(%19921) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
%19931 = "torch_c.to_builtin_tensor"(%19929) : (!torch.vtensor<[4,4096,4096],f8E4M3FNUZ>) -> tensor<4x4096x4096xf8E4M3FNUZ>
%19932 = "util.call"(%19930, %19931) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x?x4096xf32>
%19933 = "torch_c.from_builtin_tensor"(%19932) : (tensor<4x?x4096xf32>) -> !torch.vtensor<[4,?,4096],f32>
"torch.bind_symbolic_shape"(%19933, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
%19934 = "torch.aten.div.Tensor"(%19933, %17275) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],f32>
"torch.bind_symbolic_shape"(%19934, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
%19935 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
%19936 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
%19937 = "torch.aten.clamp"(%19934, %19935, %19936) : (!torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],f32>
"torch.bind_symbolic_shape"(%19937, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
%19938 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
%19939 = "torch.prims.convert_element_type"(%19937, %19938) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%19939, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
%19940 = "torch.aten.div.Tensor"(%19915, %17277) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
"torch.bind_symbolic_shape"(%19940, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
%19941 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
%19942 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
%19943 = "torch.aten.clamp"(%19940, %19941, %19942) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
"torch.bind_symbolic_shape"(%19943, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
%19944 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
%19945 = "torch.prims.convert_element_type"(%19943, %19944) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%19945, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
%19946 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%19947 = "torch.aten.unsqueeze"(%17279, %19946) : (!torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>
%19948 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%19949 = "torch.constant.int"() <{value = 1024 : i64}> : () -> !torch.int
%19950 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
%19951 = "torch.prim.ListConstruct"(%19948, %19949, %19950) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%19952 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
%19953 = "torch.aten.expand"(%19947, %19951, %19952) : (!torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ>
%19954 = "torch_c.to_builtin_tensor"(%19945) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
%19955 = "torch_c.to_builtin_tensor"(%19953) : (!torch.vtensor<[4,1024,4096],f8E4M3FNUZ>) -> tensor<4x1024x4096xf8E4M3FNUZ>
%19956 = "util.call"(%19954, %19955) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x?x1024xf32>
%19957 = "torch_c.from_builtin_tensor"(%19956) : (tensor<4x?x1024xf32>) -> !torch.vtensor<[4,?,1024],f32>
"torch.bind_symbolic_shape"(%19957, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
%19958 = "torch.aten.div.Tensor"(%19957, %17281) : (!torch.vtensor<[4,?,1024],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,1024],f32>
"torch.bind_symbolic_shape"(%19958, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
%19959 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
%19960 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
%19961 = "torch.aten.clamp"(%19958, %19959, %19960) : (!torch.vtensor<[4,?,1024],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,1024],f32>
"torch.bind_symbolic_shape"(%19961, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
%19962 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
%19963 = "torch.prims.convert_element_type"(%19961, %19962) : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> !torch.vtensor<[4,?,1024],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%19963, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.int) -> ()
%19964 = "torch.aten.div.Tensor"(%19915, %17283) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
"torch.bind_symbolic_shape"(%19964, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
%19965 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
%19966 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
%19967 = "torch.aten.clamp"(%19964, %19965, %19966) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
"torch.bind_symbolic_shape"(%19967, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
%19968 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
%19969 = "torch.prims.convert_element_type"(%19967, %19968) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%19969, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
%19970 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%19971 = "torch.aten.unsqueeze"(%17285, %19970) : (!torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>
%19972 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%19973 = "torch.constant.int"() <{value = 1024 : i64}> : () -> !torch.int
%19974 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
%19975 = "torch.prim.ListConstruct"(%19972, %19973, %19974) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%19976 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
%19977 = "torch.aten.expand"(%19971, %19975, %19976) : (!torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ>
%19978 = "torch_c.to_builtin_tensor"(%19969) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
%19979 = "torch_c.to_builtin_tensor"(%19977) : (!torch.vtensor<[4,1024,4096],f8E4M3FNUZ>) -> tensor<4x1024x4096xf8E4M3FNUZ>
%19980 = "util.call"(%19978, %19979) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x?x1024xf32>
%19981 = "torch_c.from_builtin_tensor"(%19980) : (tensor<4x?x1024xf32>) -> !torch.vtensor<[4,?,1024],f32>
"torch.bind_symbolic_shape"(%19981, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
%19982 = "torch.aten.div.Tensor"(%19981, %17287) : (!torch.vtensor<[4,?,1024],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,1024],f32>
"torch.bind_symbolic_shape"(%19982, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
%19983 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
%19984 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
%19985 = "torch.aten.clamp"(%19982, %19983, %19984) : (!torch.vtensor<[4,?,1024],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,1024],f32>
"torch.bind_symbolic_shape"(%19985, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
%19986 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
%19987 = "torch.prims.convert_element_type"(%19985, %19986) : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> !torch.vtensor<[4,?,1024],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%19987, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.int) -> ()
%19988 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%19989 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
%19990 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
%19991 = "torch.prim.ListConstruct"(%19988, %18481, %19989, %19990) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%19992 = "torch.aten.view"(%19939, %19991) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%19992, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
%19993 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%19994 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
%19995 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
%19996 = "torch.prim.ListConstruct"(%19993, %18481, %19994, %19995) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%19997 = "torch.aten.view"(%19963, %19996) : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%19997, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
%19998 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%19999 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
%20000 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
%20001 = "torch.prim.ListConstruct"(%19998, %18481, %19999, %20000) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%20002 = "torch.aten.view"(%19987, %20001) : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%20002, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
%20003 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
%20004 = "torch.constant.none"() : () -> !torch.none
%20005 = "torch.constant.none"() : () -> !torch.none
%20006 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
%20007 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
%20008 = "torch.aten.arange"(%20003, %20004, %20005, %20006, %20007) : (!torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[131072],si64>
%20009 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%20010 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
%20011 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
%20012 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%20013 = "torch.constant.none"() : () -> !torch.none
%20014 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
%20015 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
%20016 = "torch.aten.arange.start_step"(%20009, %20010, %20011, %20012, %20013, %20014, %20015) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[64],si64>
%20017 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
%20018 = "torch.prims.convert_element_type"(%20016, %20017) : (!torch.vtensor<[64],si64>, !torch.int) -> !torch.vtensor<[64],f32>
%20019 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
%20020 = "torch.aten.div.Scalar"(%20018, %20019) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
%20021 = "torch.constant.float"() <{value = 5.000000e+05 : f64}> : () -> !torch.float
%20022 = "torch.aten.pow.Scalar"(%20021, %20020) : (!torch.float, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
%20023 = "torch.aten.reciprocal"(%20022) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
%20024 = "torch.constant.float"() <{value = 1.000000e+00 : f64}> : () -> !torch.float
%20025 = "torch.aten.mul.Scalar"(%20023, %20024) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
%20026 = "torch.aten.reciprocal"(%20025) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
%20027 = "torch.constant.float"() <{value = 6.2831853071795862 : f64}> : () -> !torch.float
%20028 = "torch.aten.mul.Scalar"(%20026, %20027) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
%20029 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
%20030 = "torch.aten.gt.Scalar"(%20028, %20029) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
%20031 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
%20032 = "torch.aten.div.Scalar"(%20025, %20031) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
%20033 = "torch.aten.where.self"(%20030, %20032, %20025) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
%20034 = "torch.aten.reciprocal"(%20028) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
%20035 = "torch.constant.int"() <{value = 8192 : i64}> : () -> !torch.int
%20036 = "torch.aten.mul.Scalar"(%20034, %20035) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
%20037 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%20038 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%20039 = "torch.aten.sub.Scalar"(%20036, %20037, %20038) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
%20040 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
%20041 = "torch.aten.div.Scalar"(%20039, %20040) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
%20042 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%20043 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%20044 = "torch.aten.rsub.Scalar"(%20041, %20042, %20043) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
%20045 = "torch.aten.mul.Tensor"(%20044, %20033) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
%20046 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
%20047 = "torch.aten.div.Scalar"(%20045, %20046) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
%20048 = "torch.aten.mul.Tensor"(%20041, %20033) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
%20049 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%20050 = "torch.aten.add.Tensor"(%20047, %20048, %20049) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
%20051 = "torch.constant.float"() <{value = 2.048000e+03 : f64}> : () -> !torch.float
%20052 = "torch.aten.lt.Scalar"(%20028, %20051) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
%20053 = "torch.aten.bitwise_not"(%20052) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
%20054 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
%20055 = "torch.aten.gt.Scalar"(%20028, %20054) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
%20056 = "torch.aten.bitwise_not"(%20055) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
%20057 = "torch.aten.mul.Tensor"(%20053, %20056) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
%20058 = "torch.aten.where.self"(%20057, %20050, %20033) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
%20059 = "torch.prim.ListConstruct"(%20058, %20058) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.list<vtensor>
%20060 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
%20061 = "torch.aten.cat"(%20059, %20060) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[128],f32>
%20062 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
%20063 = "torch.prims.convert_element_type"(%20008, %20062) : (!torch.vtensor<[131072],si64>, !torch.int) -> !torch.vtensor<[131072],f32>
%20064 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
%20065 = "torch.prims.convert_element_type"(%20061, %20064) : (!torch.vtensor<[128],f32>, !torch.int) -> !torch.vtensor<[128],f32>
%20066 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
%20067 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%20068 = "torch.prim.ListConstruct"(%20066, %20067) : (!torch.int, !torch.int) -> !torch.list<int>
%20069 = "torch.aten.view"(%20063, %20068) : (!torch.vtensor<[131072],f32>, !torch.list<int>) -> !torch.vtensor<[131072,1],f32>
%20070 = "torch.aten.mul.Tensor"(%20069, %20065) : (!torch.vtensor<[131072,1],f32>, !torch.vtensor<[128],f32>) -> !torch.vtensor<[131072,128],f32>
%20071 = "torch.aten.cos"(%20070) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
%20072 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
%20073 = "torch.prims.convert_element_type"(%20071, %20072) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
%20074 = "torch.aten.sin"(%20070) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
%20075 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
%20076 = "torch.prims.convert_element_type"(%20074, %20075) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
%20077 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%20078 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%20079 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%20080 = "torch.aten.slice.Tensor"(%20073, %20077, %20078, %18481, %20079) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
"torch.bind_symbolic_shape"(%20080, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
%20081 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%20082 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%20083 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
%20084 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%20085 = "torch.aten.slice.Tensor"(%20080, %20081, %20082, %20083, %20084) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
"torch.bind_symbolic_shape"(%20085, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
%20086 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%20087 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%20088 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%20089 = "torch.aten.slice.Tensor"(%20076, %20086, %20087, %18481, %20088) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
"torch.bind_symbolic_shape"(%20089, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
%20090 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%20091 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%20092 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
%20093 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%20094 = "torch.aten.slice.Tensor"(%20089, %20090, %20091, %20092, %20093) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
"torch.bind_symbolic_shape"(%20094, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
%20095 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%20096 = "torch.aten.unsqueeze"(%20085, %20095) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
"torch.bind_symbolic_shape"(%20096, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
%20097 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%20098 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%20099 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
%20100 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%20101 = "torch.aten.slice.Tensor"(%20096, %20097, %20098, %20099, %20100) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
"torch.bind_symbolic_shape"(%20101, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
%20102 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
%20103 = "torch.aten.unsqueeze"(%20101, %20102) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
"torch.bind_symbolic_shape"(%20103, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
%20104 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
%20105 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%20106 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
%20107 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%20108 = "torch.aten.slice.Tensor"(%20103, %20104, %20105, %20106, %20107) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
"torch.bind_symbolic_shape"(%20108, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
%20109 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%20110 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%20111 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%20112 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%20113 = "torch.prim.ListConstruct"(%20109, %20110, %20111, %20112) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%20114 = "torch.aten.repeat"(%20108, %20113) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
"torch.bind_symbolic_shape"(%20114, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
%20115 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%20116 = "torch.aten.unsqueeze"(%20094, %20115) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
"torch.bind_symbolic_shape"(%20116, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
%20117 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%20118 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%20119 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
%20120 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%20121 = "torch.aten.slice.Tensor"(%20116, %20117, %20118, %20119, %20120) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
"torch.bind_symbolic_shape"(%20121, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
%20122 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
%20123 = "torch.aten.unsqueeze"(%20121, %20122) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
"torch.bind_symbolic_shape"(%20123, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
%20124 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
%20125 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%20126 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
%20127 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%20128 = "torch.aten.slice.Tensor"(%20123, %20124, %20125, %20126, %20127) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
"torch.bind_symbolic_shape"(%20128, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
%20129 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%20130 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%20131 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%20132 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%20133 = "torch.prim.ListConstruct"(%20129, %20130, %20131, %20132) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%20134 = "torch.aten.repeat"(%20128, %20133) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
"torch.bind_symbolic_shape"(%20134, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
%20135 = "torch.aten.mul.Tensor"(%19992, %20114) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%20135, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
%20136 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
%20137 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%20138 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
%20139 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%20140 = "torch.aten.slice.Tensor"(%19992, %20136, %20137, %20138, %20139) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%20140, %18474) <{shape_expressions = #map18}> : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.int) -> ()
%20141 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
%20142 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
%20143 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
%20144 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%20145 = "torch.aten.slice.Tensor"(%19992, %20141, %20142, %20143, %20144) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%20145, %18474) <{shape_expressions = #map18}> : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.int) -> ()
%20146 = "torch.aten.neg"(%20145) : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>) -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%20146, %18474) <{shape_expressions = #map18}> : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.int) -> ()
%20147 = "torch.prim.ListConstruct"(%20146, %20140) : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>) -> !torch.list<vtensor>
%20148 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
%20149 = "torch.aten.cat"(%20147, %20148) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%20149, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
%20150 = "torch.aten.mul.Tensor"(%20149, %20134) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%20150, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
%20151 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%20152 = "torch.aten.add.Tensor"(%20135, %20150, %20151) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%20152, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
%20153 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
%20154 = "torch.constant.none"() : () -> !torch.none
%20155 = "torch.constant.none"() : () -> !torch.none
%20156 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
%20157 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
%20158 = "torch.aten.arange"(%20153, %20154, %20155, %20156, %20157) : (!torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[131072],si64>
%20159 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%20160 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
%20161 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
%20162 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%20163 = "torch.constant.none"() : () -> !torch.none
%20164 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
%20165 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
%20166 = "torch.aten.arange.start_step"(%20159, %20160, %20161, %20162, %20163, %20164, %20165) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[64],si64>
%20167 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
%20168 = "torch.prims.convert_element_type"(%20166, %20167) : (!torch.vtensor<[64],si64>, !torch.int) -> !torch.vtensor<[64],f32>
%20169 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
%20170 = "torch.aten.div.Scalar"(%20168, %20169) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
%20171 = "torch.constant.float"() <{value = 5.000000e+05 : f64}> : () -> !torch.float
%20172 = "torch.aten.pow.Scalar"(%20171, %20170) : (!torch.float, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
%20173 = "torch.aten.reciprocal"(%20172) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
%20174 = "torch.constant.float"() <{value = 1.000000e+00 : f64}> : () -> !torch.float
%20175 = "torch.aten.mul.Scalar"(%20173, %20174) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
%20176 = "torch.aten.reciprocal"(%20175) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
%20177 = "torch.constant.float"() <{value = 6.2831853071795862 : f64}> : () -> !torch.float
%20178 = "torch.aten.mul.Scalar"(%20176, %20177) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
%20179 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
%20180 = "torch.aten.gt.Scalar"(%20178, %20179) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
%20181 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
%20182 = "torch.aten.div.Scalar"(%20175, %20181) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
%20183 = "torch.aten.where.self"(%20180, %20182, %20175) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
%20184 = "torch.aten.reciprocal"(%20178) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
%20185 = "torch.constant.int"() <{value = 8192 : i64}> : () -> !torch.int
%20186 = "torch.aten.mul.Scalar"(%20184, %20185) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
%20187 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%20188 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%20189 = "torch.aten.sub.Scalar"(%20186, %20187, %20188) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
%20190 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
%20191 = "torch.aten.div.Scalar"(%20189, %20190) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
%20192 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%20193 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%20194 = "torch.aten.rsub.Scalar"(%20191, %20192, %20193) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
%20195 = "torch.aten.mul.Tensor"(%20194, %20183) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
%20196 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
%20197 = "torch.aten.div.Scalar"(%20195, %20196) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
%20198 = "torch.aten.mul.Tensor"(%20191, %20183) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
%20199 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%20200 = "torch.aten.add.Tensor"(%20197, %20198, %20199) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
%20201 = "torch.constant.float"() <{value = 2.048000e+03 : f64}> : () -> !torch.float
%20202 = "torch.aten.lt.Scalar"(%20178, %20201) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
%20203 = "torch.aten.bitwise_not"(%20202) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
%20204 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
%20205 = "torch.aten.gt.Scalar"(%20178, %20204) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
%20206 = "torch.aten.bitwise_not"(%20205) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
%20207 = "torch.aten.mul.Tensor"(%20203, %20206) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
%20208 = "torch.aten.where.self"(%20207, %20200, %20183) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
%20209 = "torch.prim.ListConstruct"(%20208, %20208) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.list<vtensor>
%20210 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
%20211 = "torch.aten.cat"(%20209, %20210) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[128],f32>
%20212 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
%20213 = "torch.prims.convert_element_type"(%20158, %20212) : (!torch.vtensor<[131072],si64>, !torch.int) -> !torch.vtensor<[131072],f32>
%20214 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
%20215 = "torch.prims.convert_element_type"(%20211, %20214) : (!torch.vtensor<[128],f32>, !torch.int) -> !torch.vtensor<[128],f32>
%20216 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
%20217 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%20218 = "torch.prim.ListConstruct"(%20216, %20217) : (!torch.int, !torch.int) -> !torch.list<int>
%20219 = "torch.aten.view"(%20213, %20218) : (!torch.vtensor<[131072],f32>, !torch.list<int>) -> !torch.vtensor<[131072,1],f32>
%20220 = "torch.aten.mul.Tensor"(%20219, %20215) : (!torch.vtensor<[131072,1],f32>, !torch.vtensor<[128],f32>) -> !torch.vtensor<[131072,128],f32>
%20221 = "torch.aten.cos"(%20220) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
%20222 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
%20223 = "torch.prims.convert_element_type"(%20221, %20222) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
%20224 = "torch.aten.sin"(%20220) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
%20225 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
%20226 = "torch.prims.convert_element_type"(%20224, %20225) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
%20227 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%20228 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%20229 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%20230 = "torch.aten.slice.Tensor"(%20223, %20227, %20228, %18481, %20229) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
"torch.bind_symbolic_shape"(%20230, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
%20231 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%20232 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%20233 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
%20234 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%20235 = "torch.aten.slice.Tensor"(%20230, %20231, %20232, %20233, %20234) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
"torch.bind_symbolic_shape"(%20235, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
%20236 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%20237 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%20238 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%20239 = "torch.aten.slice.Tensor"(%20226, %20236, %20237, %18481, %20238) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
"torch.bind_symbolic_shape"(%20239, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
%20240 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%20241 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%20242 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
%20243 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%20244 = "torch.aten.slice.Tensor"(%20239, %20240, %20241, %20242, %20243) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
"torch.bind_symbolic_shape"(%20244, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
%20245 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%20246 = "torch.aten.unsqueeze"(%20235, %20245) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
"torch.bind_symbolic_shape"(%20246, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
%20247 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%20248 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%20249 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
%20250 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%20251 = "torch.aten.slice.Tensor"(%20246, %20247, %20248, %20249, %20250) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
"torch.bind_symbolic_shape"(%20251, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
%20252 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
%20253 = "torch.aten.unsqueeze"(%20251, %20252) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
"torch.bind_symbolic_shape"(%20253, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
%20254 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
%20255 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%20256 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
%20257 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%20258 = "torch.aten.slice.Tensor"(%20253, %20254, %20255, %20256, %20257) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
"torch.bind_symbolic_shape"(%20258, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
%20259 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%20260 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%20261 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%20262 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%20263 = "torch.prim.ListConstruct"(%20259, %20260, %20261, %20262) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%20264 = "torch.aten.repeat"(%20258, %20263) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
"torch.bind_symbolic_shape"(%20264, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
%20265 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%20266 = "torch.aten.unsqueeze"(%20244, %20265) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
"torch.bind_symbolic_shape"(%20266, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
%20267 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%20268 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%20269 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
%20270 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%20271 = "torch.aten.slice.Tensor"(%20266, %20267, %20268, %20269, %20270) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
"torch.bind_symbolic_shape"(%20271, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
%20272 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
%20273 = "torch.aten.unsqueeze"(%20271, %20272) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
"torch.bind_symbolic_shape"(%20273, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
%20274 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
%20275 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%20276 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
%20277 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%20278 = "torch.aten.slice.Tensor"(%20273, %20274, %20275, %20276, %20277) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
"torch.bind_symbolic_shape"(%20278, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
%20279 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%20280 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%20281 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%20282 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%20283 = "torch.prim.ListConstruct"(%20279, %20280, %20281, %20282) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%20284 = "torch.aten.repeat"(%20278, %20283) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
"torch.bind_symbolic_shape"(%20284, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
%20285 = "torch.aten.mul.Tensor"(%19997, %20264) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%20285, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
%20286 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
%20287 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%20288 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
%20289 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%20290 = "torch.aten.slice.Tensor"(%19997, %20286, %20287, %20288, %20289) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%20290, %18474) <{shape_expressions = #map19}> : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.int) -> ()
%20291 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
%20292 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
%20293 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
%20294 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%20295 = "torch.aten.slice.Tensor"(%19997, %20291, %20292, %20293, %20294) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%20295, %18474) <{shape_expressions = #map19}> : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.int) -> ()
%20296 = "torch.aten.neg"(%20295) : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>) -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%20296, %18474) <{shape_expressions = #map19}> : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.int) -> ()
%20297 = "torch.prim.ListConstruct"(%20296, %20290) : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>) -> !torch.list<vtensor>
%20298 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
%20299 = "torch.aten.cat"(%20297, %20298) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%20299, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
%20300 = "torch.aten.mul.Tensor"(%20299, %20284) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%20300, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
%20301 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%20302 = "torch.aten.add.Tensor"(%20285, %20300, %20301) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%20302, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
%20303 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
%20304 = "torch.aten.mul.Scalar"(%arg69, %20303) : (!torch.vtensor<[4,?],si64>, !torch.int) -> !torch.vtensor<[4,?],si64>
"torch.bind_symbolic_shape"(%20304, %18474) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
%20305 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%20306 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%20307 = "torch.aten.add.Scalar"(%20304, %20305, %20306) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
"torch.bind_symbolic_shape"(%20307, %18474) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
%20308 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%20309 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
%20310 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
%20311 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
%20312 = "torch.prim.ListConstruct"(%20308, %18477, %20309, %20310, %20311) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%20313 = "torch.aten.view"(%20302, %20312) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%20313, %18474) <{shape_expressions = #map22}> : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
%20314 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
%20315 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
%20316 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
%20317 = "torch.prim.ListConstruct"(%19011, %20314, %20315, %20316) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%20318 = "torch.aten.view"(%20313, %20317) : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%20318, %18474) <{shape_expressions = #map23}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
%20319 = "torch.prim.ListConstruct"(%19011) : (!torch.int) -> !torch.list<int>
%20320 = "torch.aten.view"(%20307, %20319) : (!torch.vtensor<[4,?],si64>, !torch.list<int>) -> !torch.vtensor<[?],si64>
"torch.bind_symbolic_shape"(%20320, %18474) <{shape_expressions = #map24}> : (!torch.vtensor<[?],si64>, !torch.int) -> ()
%20321 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
%20322 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
%20323 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
%20324 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
%20325 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
%20326 = "torch.prim.ListConstruct"(%18479, %20321, %20322, %20323, %20324, %20325) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%20327 = "torch.aten.view"(%19729, %20326) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%20327, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
%20328 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
%20329 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
%20330 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
%20331 = "torch.prim.ListConstruct"(%18993, %20328, %20329, %20330) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%20332 = "torch.aten.view"(%20327, %20331) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%20332, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
%20333 = "torch.prim.ListConstruct"(%20320) : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>>
%20334 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
%20335 = "torch.aten.index_put"(%20332, %20333, %20318, %20334) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.bool) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%20335, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
%20336 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
%20337 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
%20338 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
%20339 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
%20340 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
%20341 = "torch.prim.ListConstruct"(%18479, %20336, %20337, %20338, %20339, %20340) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%20342 = "torch.aten.view"(%20335, %20341) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%20342, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
%20343 = "torch.constant.int"() <{value = 2097152 : i64}> : () -> !torch.int
%20344 = "torch.prim.ListConstruct"(%18479, %20343) : (!torch.int, !torch.int) -> !torch.list<int>
%20345 = "torch.aten.view"(%20342, %20344) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,2097152],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%20345, %18475) <{shape_expressions = #map2}> : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> ()
%20346 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
%20347 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
%20348 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
%20349 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
%20350 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
%20351 = "torch.prim.ListConstruct"(%18479, %20346, %20347, %20348, %20349, %20350) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%20352 = "torch.aten.view"(%20345, %20351) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%20352, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
%20353 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
%20354 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
%20355 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
%20356 = "torch.prim.ListConstruct"(%18993, %20353, %20354, %20355) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%20357 = "torch.aten.view"(%20352, %20356) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%20357, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
%20358 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%20359 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
%20360 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
%20361 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
%20362 = "torch.prim.ListConstruct"(%20358, %18477, %20359, %20360, %20361) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%20363 = "torch.aten.view"(%20002, %20362) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%20363, %18474) <{shape_expressions = #map22}> : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
%20364 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
%20365 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
%20366 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
%20367 = "torch.prim.ListConstruct"(%19011, %20364, %20365, %20366) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%20368 = "torch.aten.view"(%20363, %20367) : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%20368, %18474) <{shape_expressions = #map23}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
%20369 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%20370 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%20371 = "torch.aten.add.Scalar"(%20307, %20369, %20370) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
"torch.bind_symbolic_shape"(%20371, %18474) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
%20372 = "torch.prim.ListConstruct"(%19011) : (!torch.int) -> !torch.list<int>
%20373 = "torch.aten.view"(%20371, %20372) : (!torch.vtensor<[4,?],si64>, !torch.list<int>) -> !torch.vtensor<[?],si64>
"torch.bind_symbolic_shape"(%20373, %18474) <{shape_expressions = #map24}> : (!torch.vtensor<[?],si64>, !torch.int) -> ()
%20374 = "torch.prim.ListConstruct"(%20373) : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>>
%20375 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
%20376 = "torch.aten.index_put"(%20357, %20374, %20368, %20375) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.bool) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%20376, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
%20377 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
%20378 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
%20379 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
%20380 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
%20381 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
%20382 = "torch.prim.ListConstruct"(%18479, %20377, %20378, %20379, %20380, %20381) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%20383 = "torch.aten.view"(%20376, %20382) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%20383, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
%20384 = "torch.constant.int"() <{value = 2097152 : i64}> : () -> !torch.int
%20385 = "torch.prim.ListConstruct"(%18479, %20384) : (!torch.int, !torch.int) -> !torch.list<int>
%20386 = "torch.aten.view"(%20383, %20385) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,2097152],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%20386, %18475) <{shape_expressions = #map2}> : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> ()
%20387 = "torch.constant.int"() <{value = -2 : i64}> : () -> !torch.int
%20388 = "torch.aten.unsqueeze"(%20302, %20387) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%20388, %18474) <{shape_expressions = #map25}> : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.int) -> ()
%20389 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%20390 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
%20391 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%20392 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
%20393 = "torch.prim.ListConstruct"(%20389, %18481, %20390, %20391, %20392) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%20394 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
%20395 = "torch.aten.expand"(%20388, %20393, %20394) : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%20395, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
%20396 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%20397 = "torch.aten.clone"(%20395, %20396) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%20397, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
%20398 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%20399 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
%20400 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
%20401 = "torch.prim.ListConstruct"(%20398, %18481, %20399, %20400) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%20402 = "torch.aten._unsafe_view"(%20397, %20401) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%20402, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
%20403 = "torch.constant.int"() <{value = -2 : i64}> : () -> !torch.int
%20404 = "torch.aten.unsqueeze"(%20002, %20403) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%20404, %18474) <{shape_expressions = #map25}> : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.int) -> ()
%20405 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%20406 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
%20407 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%20408 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
%20409 = "torch.prim.ListConstruct"(%20405, %18481, %20406, %20407, %20408) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%20410 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
%20411 = "torch.aten.expand"(%20404, %20409, %20410) : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%20411, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
%20412 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%20413 = "torch.aten.clone"(%20411, %20412) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%20413, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
%20414 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%20415 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
%20416 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
%20417 = "torch.prim.ListConstruct"(%20414, %18481, %20415, %20416) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%20418 = "torch.aten._unsafe_view"(%20413, %20417) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%20418, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
%20419 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%20420 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
%20421 = "torch.aten.transpose.int"(%20152, %20419, %20420) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%20421, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
%20422 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%20423 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
%20424 = "torch.aten.transpose.int"(%20402, %20422, %20423) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%20424, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
%20425 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%20426 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
%20427 = "torch.aten.transpose.int"(%20418, %20425, %20426) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%20427, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
%20428 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%20429 = "torch.aten.squeeze.dim"(%18570, %20428) : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,1,?,?],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%20429, %18474) <{shape_expressions = #map8}> : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> ()
%20430 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%20431 = "torch.aten.squeeze.dim"(%20429, %20430) : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,1,?,?],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%20431, %18474) <{shape_expressions = #map8}> : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> ()
%20432 = "torch_c.to_builtin_tensor"(%20421) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
%20433 = "torch_c.to_builtin_tensor"(%20424) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
%20434 = "torch_c.to_builtin_tensor"(%20427) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
%20435 = "torch_c.to_builtin_tensor"(%20431) : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>) -> tensor<4x1x?x?xf8E4M3FNUZ>
%20436 = "tensor.cast"(%20435) : (tensor<4x1x?x?xf8E4M3FNUZ>) -> tensor<?x?x?x?xf8E4M3FNUZ>
%20437 = "torch_c.to_builtin_tensor"(%17289) : (!torch.vtensor<[],f32>) -> tensor<f32>
%20438 = "util.call"(%20432, %20433, %20434, %20437, %20436) <{callee = @sharktank_masked_flash_attention_4_32_128_128_f8E4M3FNUZ_f32_f32}> : (tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<f32>, tensor<?x?x?x?xf8E4M3FNUZ>) -> tensor<4x32x?x128xf32>
%20439 = "torch_c.from_builtin_tensor"(%20438) : (tensor<4x32x?x128xf32>) -> !torch.vtensor<[4,32,?,128],f32>
"torch.bind_symbolic_shape"(%20439, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f32>, !torch.int) -> ()
%20440 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%20441 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
%20442 = "torch.aten.transpose.int"(%20439, %20440, %20441) : (!torch.vtensor<[4,32,?,128],f32>, !torch.int, !torch.int) -> !torch.vtensor<[4,?,32,128],f32>
"torch.bind_symbolic_shape"(%20442, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f32>, !torch.int) -> ()
%20443 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%20444 = "torch.aten.clone"(%20442, %20443) : (!torch.vtensor<[4,?,32,128],f32>, !torch.int) -> !torch.vtensor<[4,?,32,128],f32>
"torch.bind_symbolic_shape"(%20444, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f32>, !torch.int) -> ()
%20445 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%20446 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
%20447 = "torch.prim.ListConstruct"(%20445, %18481, %20446) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%20448 = "torch.aten._unsafe_view"(%20444, %20447) : (!torch.vtensor<[4,?,32,128],f32>, !torch.list<int>) -> !torch.vtensor<[4,?,4096],f32>
"torch.bind_symbolic_shape"(%20448, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
%20449 = "torch.aten.div.Tensor"(%20448, %17291) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],f32>
"torch.bind_symbolic_shape"(%20449, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
%20450 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
%20451 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
%20452 = "torch.aten.clamp"(%20449, %20450, %20451) : (!torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],f32>
"torch.bind_symbolic_shape"(%20452, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
%20453 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
%20454 = "torch.prims.convert_element_type"(%20452, %20453) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%20454, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
%20455 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%20456 = "torch.aten.unsqueeze"(%17293, %20455) : (!torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>
%20457 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%20458 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
%20459 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
%20460 = "torch.prim.ListConstruct"(%20457, %20458, %20459) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%20461 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
%20462 = "torch.aten.expand"(%20456, %20460, %20461) : (!torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ>
%20463 = "torch_c.to_builtin_tensor"(%20454) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
%20464 = "torch_c.to_builtin_tensor"(%20462) : (!torch.vtensor<[4,4096,4096],f8E4M3FNUZ>) -> tensor<4x4096x4096xf8E4M3FNUZ>
%20465 = "util.call"(%20463, %20464) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x?x4096xf32>
%20466 = "torch_c.from_builtin_tensor"(%20465) : (tensor<4x?x4096xf32>) -> !torch.vtensor<[4,?,4096],f32>
"torch.bind_symbolic_shape"(%20466, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
%20467 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
%20468 = "torch.prims.convert_element_type"(%20466, %20467) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
"torch.bind_symbolic_shape"(%20468, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
%20469 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%20470 = "torch.aten.add.Tensor"(%19896, %20468, %20469) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
"torch.bind_symbolic_shape"(%20470, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
%20471 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
%20472 = "torch.prims.convert_element_type"(%20470, %20471) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
"torch.bind_symbolic_shape"(%20472, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
%20473 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
%20474 = "torch.aten.pow.Tensor_Scalar"(%20472, %20473) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
"torch.bind_symbolic_shape"(%20474, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
%20475 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
%20476 = "torch.prim.ListConstruct"(%20475) : (!torch.int) -> !torch.list<int>
%20477 = "torch.constant.bool"() <{value = true}> : () -> !torch.bool
%20478 = "torch.constant.none"() : () -> !torch.none
%20479 = "torch.aten.mean.dim"(%20474, %20476, %20477, %20478) : (!torch.vtensor<[4,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none) -> !torch.vtensor<[4,?,1],f32>
"torch.bind_symbolic_shape"(%20479, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
%20480 = "torch.constant.float"() <{value = 1.000000e-05 : f64}> : () -> !torch.float
%20481 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%20482 = "torch.aten.add.Scalar"(%20479, %20480, %20481) : (!torch.vtensor<[4,?,1],f32>, !torch.float, !torch.int) -> !torch.vtensor<[4,?,1],f32>
"torch.bind_symbolic_shape"(%20482, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
%20483 = "torch.aten.rsqrt"(%20482) : (!torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,1],f32>
"torch.bind_symbolic_shape"(%20483, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
%20484 = "torch.aten.mul.Tensor"(%20472, %20483) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,4096],f32>
"torch.bind_symbolic_shape"(%20484, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
%20485 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
%20486 = "torch.prims.convert_element_type"(%20484, %20485) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
"torch.bind_symbolic_shape"(%20486, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
%20487 = "torch.aten.mul.Tensor"(%17295, %20486) : (!torch.vtensor<[4096],bf16>, !torch.vtensor<[4,?,4096],bf16>) -> !torch.vtensor<[4,?,4096],bf16>
"torch.bind_symbolic_shape"(%20487, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
%20488 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
%20489 = "torch.prims.convert_element_type"(%20487, %20488) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
"torch.bind_symbolic_shape"(%20489, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
%20490 = "torch.aten.div.Tensor"(%20489, %17297) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
"torch.bind_symbolic_shape"(%20490, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
%20491 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
%20492 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
%20493 = "torch.aten.clamp"(%20490, %20491, %20492) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
"torch.bind_symbolic_shape"(%20493, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
%20494 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
%20495 = "torch.prims.convert_element_type"(%20493, %20494) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%20495, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
%20496 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%20497 = "torch.aten.unsqueeze"(%17299, %20496) : (!torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>
%20498 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%20499 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
%20500 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
%20501 = "torch.prim.ListConstruct"(%20498, %20499, %20500) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%20502 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
%20503 = "torch.aten.expand"(%20497, %20501, %20502) : (!torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ>
%20504 = "torch_c.to_builtin_tensor"(%20495) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
%20505 = "torch_c.to_builtin_tensor"(%20503) : (!torch.vtensor<[4,14336,4096],f8E4M3FNUZ>) -> tensor<4x14336x4096xf8E4M3FNUZ>
%20506 = "util.call"(%20504, %20505) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x?x14336xf32>
%20507 = "torch_c.from_builtin_tensor"(%20506) : (tensor<4x?x14336xf32>) -> !torch.vtensor<[4,?,14336],f32>
"torch.bind_symbolic_shape"(%20507, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> ()
%20508 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
%20509 = "torch.prims.convert_element_type"(%20507, %20508) : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> !torch.vtensor<[4,?,14336],bf16>
"torch.bind_symbolic_shape"(%20509, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
%20510 = "torch.aten.silu"(%20509) : (!torch.vtensor<[4,?,14336],bf16>) -> !torch.vtensor<[4,?,14336],bf16>
"torch.bind_symbolic_shape"(%20510, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
%20511 = "torch.aten.div.Tensor"(%20489, %17301) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
"torch.bind_symbolic_shape"(%20511, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
%20512 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
%20513 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
%20514 = "torch.aten.clamp"(%20511, %20512, %20513) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
"torch.bind_symbolic_shape"(%20514, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
%20515 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
%20516 = "torch.prims.convert_element_type"(%20514, %20515) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%20516, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
%20517 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%20518 = "torch.aten.unsqueeze"(%17303, %20517) : (!torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>
%20519 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%20520 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
%20521 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
%20522 = "torch.prim.ListConstruct"(%20519, %20520, %20521) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%20523 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
%20524 = "torch.aten.expand"(%20518, %20522, %20523) : (!torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ>
%20525 = "torch_c.to_builtin_tensor"(%20516) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
%20526 = "torch_c.to_builtin_tensor"(%20524) : (!torch.vtensor<[4,14336,4096],f8E4M3FNUZ>) -> tensor<4x14336x4096xf8E4M3FNUZ>
%20527 = "util.call"(%20525, %20526) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x?x14336xf32>
%20528 = "torch_c.from_builtin_tensor"(%20527) : (tensor<4x?x14336xf32>) -> !torch.vtensor<[4,?,14336],f32>
"torch.bind_symbolic_shape"(%20528, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> ()
%20529 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
%20530 = "torch.prims.convert_element_type"(%20528, %20529) : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> !torch.vtensor<[4,?,14336],bf16>
"torch.bind_symbolic_shape"(%20530, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
%20531 = "torch.aten.mul.Tensor"(%20510, %20530) : (!torch.vtensor<[4,?,14336],bf16>, !torch.vtensor<[4,?,14336],bf16>) -> !torch.vtensor<[4,?,14336],bf16>
"torch.bind_symbolic_shape"(%20531, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
%20532 = "torch.aten.div.Tensor"(%20531, %17305) : (!torch.vtensor<[4,?,14336],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,14336],bf16>
"torch.bind_symbolic_shape"(%20532, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
%20533 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
%20534 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
%20535 = "torch.aten.clamp"(%20532, %20533, %20534) : (!torch.vtensor<[4,?,14336],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,14336],bf16>
"torch.bind_symbolic_shape"(%20535, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
%20536 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
%20537 = "torch.prims.convert_element_type"(%20535, %20536) : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> !torch.vtensor<[4,?,14336],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%20537, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],f8E4M3FNUZ>, !torch.int) -> ()
%20538 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%20539 = "torch.aten.unsqueeze"(%17307, %20538) : (!torch.vtensor<[4096,14336],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,14336],f8E4M3FNUZ>
%20540 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%20541 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
%20542 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
%20543 = "torch.prim.ListConstruct"(%20540, %20541, %20542) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%20544 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
%20545 = "torch.aten.expand"(%20539, %20543, %20544) : (!torch.vtensor<[1,4096,14336],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,14336],f8E4M3FNUZ>
%20546 = "torch_c.to_builtin_tensor"(%20537) : (!torch.vtensor<[4,?,14336],f8E4M3FNUZ>) -> tensor<4x?x14336xf8E4M3FNUZ>
%20547 = "torch_c.to_builtin_tensor"(%20545) : (!torch.vtensor<[4,4096,14336],f8E4M3FNUZ>) -> tensor<4x4096x14336xf8E4M3FNUZ>
%20548 = "util.call"(%20546, %20547) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx14336xf8E4M3FNUZ_R4x4096x14336xf8E4M3FNUZ}> : (tensor<4x?x14336xf8E4M3FNUZ>, tensor<4x4096x14336xf8E4M3FNUZ>) -> tensor<4x?x4096xf32>
%20549 = "torch_c.from_builtin_tensor"(%20548) : (tensor<4x?x4096xf32>) -> !torch.vtensor<[4,?,4096],f32>
"torch.bind_symbolic_shape"(%20549, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
%20550 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
%20551 = "torch.prims.convert_element_type"(%20549, %20550) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
"torch.bind_symbolic_shape"(%20551, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
%20552 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%20553 = "torch.aten.add.Tensor"(%20470, %20551, %20552) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
"torch.bind_symbolic_shape"(%20553, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
%20554 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
%20555 = "torch.prims.convert_element_type"(%20553, %20554) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
"torch.bind_symbolic_shape"(%20555, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
%20556 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
%20557 = "torch.aten.pow.Tensor_Scalar"(%20555, %20556) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
"torch.bind_symbolic_shape"(%20557, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
%20558 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
%20559 = "torch.prim.ListConstruct"(%20558) : (!torch.int) -> !torch.list<int>
%20560 = "torch.constant.bool"() <{value = true}> : () -> !torch.bool
%20561 = "torch.constant.none"() : () -> !torch.none
%20562 = "torch.aten.mean.dim"(%20557, %20559, %20560, %20561) : (!torch.vtensor<[4,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none) -> !torch.vtensor<[4,?,1],f32>
"torch.bind_symbolic_shape"(%20562, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
%20563 = "torch.constant.float"() <{value = 1.000000e-05 : f64}> : () -> !torch.float
%20564 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%20565 = "torch.aten.add.Scalar"(%20562, %20563, %20564) : (!torch.vtensor<[4,?,1],f32>, !torch.float, !torch.int) -> !torch.vtensor<[4,?,1],f32>
"torch.bind_symbolic_shape"(%20565, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
%20566 = "torch.aten.rsqrt"(%20565) : (!torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,1],f32>
"torch.bind_symbolic_shape"(%20566, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
%20567 = "torch.aten.mul.Tensor"(%20555, %20566) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,4096],f32>
"torch.bind_symbolic_shape"(%20567, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
%20568 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
%20569 = "torch.prims.convert_element_type"(%20567, %20568) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
"torch.bind_symbolic_shape"(%20569, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
%20570 = "torch.aten.mul.Tensor"(%17309, %20569) : (!torch.vtensor<[4096],bf16>, !torch.vtensor<[4,?,4096],bf16>) -> !torch.vtensor<[4,?,4096],bf16>
"torch.bind_symbolic_shape"(%20570, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
%20571 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
%20572 = "torch.prims.convert_element_type"(%20570, %20571) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
"torch.bind_symbolic_shape"(%20572, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
%20573 = "torch.aten.div.Tensor"(%20572, %17311) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
"torch.bind_symbolic_shape"(%20573, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
%20574 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
%20575 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
%20576 = "torch.aten.clamp"(%20573, %20574, %20575) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
"torch.bind_symbolic_shape"(%20576, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
%20577 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
%20578 = "torch.prims.convert_element_type"(%20576, %20577) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%20578, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
%20579 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%20580 = "torch.aten.unsqueeze"(%17313, %20579) : (!torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>
%20581 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%20582 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
%20583 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
%20584 = "torch.prim.ListConstruct"(%20581, %20582, %20583) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%20585 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
%20586 = "torch.aten.expand"(%20580, %20584, %20585) : (!torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ>
%20587 = "torch_c.to_builtin_tensor"(%20578) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
%20588 = "torch_c.to_builtin_tensor"(%20586) : (!torch.vtensor<[4,4096,4096],f8E4M3FNUZ>) -> tensor<4x4096x4096xf8E4M3FNUZ>
%20589 = "util.call"(%20587, %20588) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x?x4096xf32>
%20590 = "torch_c.from_builtin_tensor"(%20589) : (tensor<4x?x4096xf32>) -> !torch.vtensor<[4,?,4096],f32>
"torch.bind_symbolic_shape"(%20590, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
%20591 = "torch.aten.div.Tensor"(%20590, %17315) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],f32>
"torch.bind_symbolic_shape"(%20591, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
%20592 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
%20593 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
%20594 = "torch.aten.clamp"(%20591, %20592, %20593) : (!torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],f32>
"torch.bind_symbolic_shape"(%20594, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
%20595 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
%20596 = "torch.prims.convert_element_type"(%20594, %20595) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%20596, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
%20597 = "torch.aten.div.Tensor"(%20572, %17317) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
"torch.bind_symbolic_shape"(%20597, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
%20598 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
%20599 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
%20600 = "torch.aten.clamp"(%20597, %20598, %20599) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
"torch.bind_symbolic_shape"(%20600, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
%20601 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
%20602 = "torch.prims.convert_element_type"(%20600, %20601) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%20602, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
%20603 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%20604 = "torch.aten.unsqueeze"(%17319, %20603) : (!torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>
%20605 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%20606 = "torch.constant.int"() <{value = 1024 : i64}> : () -> !torch.int
%20607 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
%20608 = "torch.prim.ListConstruct"(%20605, %20606, %20607) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%20609 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
%20610 = "torch.aten.expand"(%20604, %20608, %20609) : (!torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ>
%20611 = "torch_c.to_builtin_tensor"(%20602) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
%20612 = "torch_c.to_builtin_tensor"(%20610) : (!torch.vtensor<[4,1024,4096],f8E4M3FNUZ>) -> tensor<4x1024x4096xf8E4M3FNUZ>
%20613 = "util.call"(%20611, %20612) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x?x1024xf32>
%20614 = "torch_c.from_builtin_tensor"(%20613) : (tensor<4x?x1024xf32>) -> !torch.vtensor<[4,?,1024],f32>
"torch.bind_symbolic_shape"(%20614, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
%20615 = "torch.aten.div.Tensor"(%20614, %17321) : (!torch.vtensor<[4,?,1024],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,1024],f32>
"torch.bind_symbolic_shape"(%20615, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
%20616 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
%20617 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
%20618 = "torch.aten.clamp"(%20615, %20616, %20617) : (!torch.vtensor<[4,?,1024],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,1024],f32>
"torch.bind_symbolic_shape"(%20618, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
%20619 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
%20620 = "torch.prims.convert_element_type"(%20618, %20619) : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> !torch.vtensor<[4,?,1024],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%20620, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.int) -> ()
%20621 = "torch.aten.div.Tensor"(%20572, %17323) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
"torch.bind_symbolic_shape"(%20621, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
%20622 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
%20623 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
%20624 = "torch.aten.clamp"(%20621, %20622, %20623) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
"torch.bind_symbolic_shape"(%20624, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
%20625 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
%20626 = "torch.prims.convert_element_type"(%20624, %20625) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%20626, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
%20627 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%20628 = "torch.aten.unsqueeze"(%17325, %20627) : (!torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>
%20629 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%20630 = "torch.constant.int"() <{value = 1024 : i64}> : () -> !torch.int
%20631 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
%20632 = "torch.prim.ListConstruct"(%20629, %20630, %20631) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%20633 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
%20634 = "torch.aten.expand"(%20628, %20632, %20633) : (!torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ>
%20635 = "torch_c.to_builtin_tensor"(%20626) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
%20636 = "torch_c.to_builtin_tensor"(%20634) : (!torch.vtensor<[4,1024,4096],f8E4M3FNUZ>) -> tensor<4x1024x4096xf8E4M3FNUZ>
%20637 = "util.call"(%20635, %20636) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x?x1024xf32>
%20638 = "torch_c.from_builtin_tensor"(%20637) : (tensor<4x?x1024xf32>) -> !torch.vtensor<[4,?,1024],f32>
"torch.bind_symbolic_shape"(%20638, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
%20639 = "torch.aten.div.Tensor"(%20638, %17327) : (!torch.vtensor<[4,?,1024],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,1024],f32>
"torch.bind_symbolic_shape"(%20639, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
%20640 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
%20641 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
%20642 = "torch.aten.clamp"(%20639, %20640, %20641) : (!torch.vtensor<[4,?,1024],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,1024],f32>
"torch.bind_symbolic_shape"(%20642, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
%20643 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
%20644 = "torch.prims.convert_element_type"(%20642, %20643) : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> !torch.vtensor<[4,?,1024],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%20644, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.int) -> ()
%20645 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%20646 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
%20647 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
%20648 = "torch.prim.ListConstruct"(%20645, %18481, %20646, %20647) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%20649 = "torch.aten.view"(%20596, %20648) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%20649, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
%20650 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%20651 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
%20652 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
%20653 = "torch.prim.ListConstruct"(%20650, %18481, %20651, %20652) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%20654 = "torch.aten.view"(%20620, %20653) : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%20654, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
%20655 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%20656 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
%20657 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
%20658 = "torch.prim.ListConstruct"(%20655, %18481, %20656, %20657) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%20659 = "torch.aten.view"(%20644, %20658) : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%20659, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
%20660 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
%20661 = "torch.constant.none"() : () -> !torch.none
%20662 = "torch.constant.none"() : () -> !torch.none
%20663 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
%20664 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
%20665 = "torch.aten.arange"(%20660, %20661, %20662, %20663, %20664) : (!torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[131072],si64>
%20666 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%20667 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
%20668 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
%20669 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%20670 = "torch.constant.none"() : () -> !torch.none
%20671 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
%20672 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
%20673 = "torch.aten.arange.start_step"(%20666, %20667, %20668, %20669, %20670, %20671, %20672) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[64],si64>
%20674 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
%20675 = "torch.prims.convert_element_type"(%20673, %20674) : (!torch.vtensor<[64],si64>, !torch.int) -> !torch.vtensor<[64],f32>
%20676 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
%20677 = "torch.aten.div.Scalar"(%20675, %20676) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
%20678 = "torch.constant.float"() <{value = 5.000000e+05 : f64}> : () -> !torch.float
%20679 = "torch.aten.pow.Scalar"(%20678, %20677) : (!torch.float, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
%20680 = "torch.aten.reciprocal"(%20679) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
%20681 = "torch.constant.float"() <{value = 1.000000e+00 : f64}> : () -> !torch.float
%20682 = "torch.aten.mul.Scalar"(%20680, %20681) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
%20683 = "torch.aten.reciprocal"(%20682) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
%20684 = "torch.constant.float"() <{value = 6.2831853071795862 : f64}> : () -> !torch.float
%20685 = "torch.aten.mul.Scalar"(%20683, %20684) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
%20686 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
%20687 = "torch.aten.gt.Scalar"(%20685, %20686) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
%20688 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
%20689 = "torch.aten.div.Scalar"(%20682, %20688) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
%20690 = "torch.aten.where.self"(%20687, %20689, %20682) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
%20691 = "torch.aten.reciprocal"(%20685) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
%20692 = "torch.constant.int"() <{value = 8192 : i64}> : () -> !torch.int
%20693 = "torch.aten.mul.Scalar"(%20691, %20692) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
%20694 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%20695 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%20696 = "torch.aten.sub.Scalar"(%20693, %20694, %20695) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
%20697 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
%20698 = "torch.aten.div.Scalar"(%20696, %20697) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
%20699 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%20700 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%20701 = "torch.aten.rsub.Scalar"(%20698, %20699, %20700) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
%20702 = "torch.aten.mul.Tensor"(%20701, %20690) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
%20703 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
%20704 = "torch.aten.div.Scalar"(%20702, %20703) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
%20705 = "torch.aten.mul.Tensor"(%20698, %20690) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
%20706 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%20707 = "torch.aten.add.Tensor"(%20704, %20705, %20706) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
%20708 = "torch.constant.float"() <{value = 2.048000e+03 : f64}> : () -> !torch.float
%20709 = "torch.aten.lt.Scalar"(%20685, %20708) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
%20710 = "torch.aten.bitwise_not"(%20709) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
%20711 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
%20712 = "torch.aten.gt.Scalar"(%20685, %20711) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
%20713 = "torch.aten.bitwise_not"(%20712) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
%20714 = "torch.aten.mul.Tensor"(%20710, %20713) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
%20715 = "torch.aten.where.self"(%20714, %20707, %20690) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
%20716 = "torch.prim.ListConstruct"(%20715, %20715) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.list<vtensor>
%20717 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
%20718 = "torch.aten.cat"(%20716, %20717) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[128],f32>
%20719 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
%20720 = "torch.prims.convert_element_type"(%20665, %20719) : (!torch.vtensor<[131072],si64>, !torch.int) -> !torch.vtensor<[131072],f32>
%20721 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
%20722 = "torch.prims.convert_element_type"(%20718, %20721) : (!torch.vtensor<[128],f32>, !torch.int) -> !torch.vtensor<[128],f32>
%20723 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
%20724 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%20725 = "torch.prim.ListConstruct"(%20723, %20724) : (!torch.int, !torch.int) -> !torch.list<int>
%20726 = "torch.aten.view"(%20720, %20725) : (!torch.vtensor<[131072],f32>, !torch.list<int>) -> !torch.vtensor<[131072,1],f32>
%20727 = "torch.aten.mul.Tensor"(%20726, %20722) : (!torch.vtensor<[131072,1],f32>, !torch.vtensor<[128],f32>) -> !torch.vtensor<[131072,128],f32>
%20728 = "torch.aten.cos"(%20727) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
%20729 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
%20730 = "torch.prims.convert_element_type"(%20728, %20729) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
%20731 = "torch.aten.sin"(%20727) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
%20732 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
%20733 = "torch.prims.convert_element_type"(%20731, %20732) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
%20734 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%20735 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%20736 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%20737 = "torch.aten.slice.Tensor"(%20730, %20734, %20735, %18481, %20736) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
"torch.bind_symbolic_shape"(%20737, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
%20738 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%20739 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%20740 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
%20741 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%20742 = "torch.aten.slice.Tensor"(%20737, %20738, %20739, %20740, %20741) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
"torch.bind_symbolic_shape"(%20742, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
%20743 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%20744 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%20745 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%20746 = "torch.aten.slice.Tensor"(%20733, %20743, %20744, %18481, %20745) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
"torch.bind_symbolic_shape"(%20746, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
%20747 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%20748 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%20749 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
%20750 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%20751 = "torch.aten.slice.Tensor"(%20746, %20747, %20748, %20749, %20750) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
"torch.bind_symbolic_shape"(%20751, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
%20752 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%20753 = "torch.aten.unsqueeze"(%20742, %20752) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
"torch.bind_symbolic_shape"(%20753, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
%20754 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%20755 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%20756 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
%20757 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%20758 = "torch.aten.slice.Tensor"(%20753, %20754, %20755, %20756, %20757) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
"torch.bind_symbolic_shape"(%20758, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
%20759 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
%20760 = "torch.aten.unsqueeze"(%20758, %20759) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
"torch.bind_symbolic_shape"(%20760, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
%20761 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
%20762 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%20763 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
%20764 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%20765 = "torch.aten.slice.Tensor"(%20760, %20761, %20762, %20763, %20764) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
"torch.bind_symbolic_shape"(%20765, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
%20766 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%20767 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%20768 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%20769 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%20770 = "torch.prim.ListConstruct"(%20766, %20767, %20768, %20769) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%20771 = "torch.aten.repeat"(%20765, %20770) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
"torch.bind_symbolic_shape"(%20771, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
%20772 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%20773 = "torch.aten.unsqueeze"(%20751, %20772) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
"torch.bind_symbolic_shape"(%20773, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
%20774 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%20775 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%20776 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
%20777 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%20778 = "torch.aten.slice.Tensor"(%20773, %20774, %20775, %20776, %20777) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
"torch.bind_symbolic_shape"(%20778, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
%20779 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
%20780 = "torch.aten.unsqueeze"(%20778, %20779) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
"torch.bind_symbolic_shape"(%20780, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
%20781 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
%20782 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%20783 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
%20784 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%20785 = "torch.aten.slice.Tensor"(%20780, %20781, %20782, %20783, %20784) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
"torch.bind_symbolic_shape"(%20785, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
%20786 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%20787 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%20788 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%20789 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%20790 = "torch.prim.ListConstruct"(%20786, %20787, %20788, %20789) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%20791 = "torch.aten.repeat"(%20785, %20790) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
"torch.bind_symbolic_shape"(%20791, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
%20792 = "torch.aten.mul.Tensor"(%20649, %20771) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%20792, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
%20793 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
%20794 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%20795 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
%20796 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%20797 = "torch.aten.slice.Tensor"(%20649, %20793, %20794, %20795, %20796) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%20797, %18474) <{shape_expressions = #map18}> : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.int) -> ()
%20798 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
%20799 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
%20800 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
%20801 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%20802 = "torch.aten.slice.Tensor"(%20649, %20798, %20799, %20800, %20801) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%20802, %18474) <{shape_expressions = #map18}> : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.int) -> ()
%20803 = "torch.aten.neg"(%20802) : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>) -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%20803, %18474) <{shape_expressions = #map18}> : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.int) -> ()
%20804 = "torch.prim.ListConstruct"(%20803, %20797) : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>) -> !torch.list<vtensor>
%20805 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
%20806 = "torch.aten.cat"(%20804, %20805) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%20806, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
%20807 = "torch.aten.mul.Tensor"(%20806, %20791) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%20807, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
%20808 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%20809 = "torch.aten.add.Tensor"(%20792, %20807, %20808) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%20809, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
%20810 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
%20811 = "torch.constant.none"() : () -> !torch.none
%20812 = "torch.constant.none"() : () -> !torch.none
%20813 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
%20814 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
%20815 = "torch.aten.arange"(%20810, %20811, %20812, %20813, %20814) : (!torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[131072],si64>
%20816 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%20817 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
%20818 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
%20819 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%20820 = "torch.constant.none"() : () -> !torch.none
%20821 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
%20822 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
%20823 = "torch.aten.arange.start_step"(%20816, %20817, %20818, %20819, %20820, %20821, %20822) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[64],si64>
%20824 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
%20825 = "torch.prims.convert_element_type"(%20823, %20824) : (!torch.vtensor<[64],si64>, !torch.int) -> !torch.vtensor<[64],f32>
%20826 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
%20827 = "torch.aten.div.Scalar"(%20825, %20826) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
%20828 = "torch.constant.float"() <{value = 5.000000e+05 : f64}> : () -> !torch.float
%20829 = "torch.aten.pow.Scalar"(%20828, %20827) : (!torch.float, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
%20830 = "torch.aten.reciprocal"(%20829) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
%20831 = "torch.constant.float"() <{value = 1.000000e+00 : f64}> : () -> !torch.float
%20832 = "torch.aten.mul.Scalar"(%20830, %20831) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
%20833 = "torch.aten.reciprocal"(%20832) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
%20834 = "torch.constant.float"() <{value = 6.2831853071795862 : f64}> : () -> !torch.float
%20835 = "torch.aten.mul.Scalar"(%20833, %20834) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
%20836 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
%20837 = "torch.aten.gt.Scalar"(%20835, %20836) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
%20838 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
%20839 = "torch.aten.div.Scalar"(%20832, %20838) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
%20840 = "torch.aten.where.self"(%20837, %20839, %20832) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
%20841 = "torch.aten.reciprocal"(%20835) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
%20842 = "torch.constant.int"() <{value = 8192 : i64}> : () -> !torch.int
%20843 = "torch.aten.mul.Scalar"(%20841, %20842) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
%20844 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%20845 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%20846 = "torch.aten.sub.Scalar"(%20843, %20844, %20845) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
%20847 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
%20848 = "torch.aten.div.Scalar"(%20846, %20847) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
%20849 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%20850 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%20851 = "torch.aten.rsub.Scalar"(%20848, %20849, %20850) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
%20852 = "torch.aten.mul.Tensor"(%20851, %20840) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
%20853 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
%20854 = "torch.aten.div.Scalar"(%20852, %20853) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
%20855 = "torch.aten.mul.Tensor"(%20848, %20840) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
%20856 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%20857 = "torch.aten.add.Tensor"(%20854, %20855, %20856) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
%20858 = "torch.constant.float"() <{value = 2.048000e+03 : f64}> : () -> !torch.float
%20859 = "torch.aten.lt.Scalar"(%20835, %20858) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
%20860 = "torch.aten.bitwise_not"(%20859) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
%20861 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
%20862 = "torch.aten.gt.Scalar"(%20835, %20861) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
%20863 = "torch.aten.bitwise_not"(%20862) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
%20864 = "torch.aten.mul.Tensor"(%20860, %20863) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
%20865 = "torch.aten.where.self"(%20864, %20857, %20840) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
%20866 = "torch.prim.ListConstruct"(%20865, %20865) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.list<vtensor>
%20867 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
%20868 = "torch.aten.cat"(%20866, %20867) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[128],f32>
%20869 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
%20870 = "torch.prims.convert_element_type"(%20815, %20869) : (!torch.vtensor<[131072],si64>, !torch.int) -> !torch.vtensor<[131072],f32>
%20871 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
%20872 = "torch.prims.convert_element_type"(%20868, %20871) : (!torch.vtensor<[128],f32>, !torch.int) -> !torch.vtensor<[128],f32>
%20873 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
%20874 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%20875 = "torch.prim.ListConstruct"(%20873, %20874) : (!torch.int, !torch.int) -> !torch.list<int>
%20876 = "torch.aten.view"(%20870, %20875) : (!torch.vtensor<[131072],f32>, !torch.list<int>) -> !torch.vtensor<[131072,1],f32>
%20877 = "torch.aten.mul.Tensor"(%20876, %20872) : (!torch.vtensor<[131072,1],f32>, !torch.vtensor<[128],f32>) -> !torch.vtensor<[131072,128],f32>
%20878 = "torch.aten.cos"(%20877) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
%20879 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
%20880 = "torch.prims.convert_element_type"(%20878, %20879) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
%20881 = "torch.aten.sin"(%20877) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
%20882 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
%20883 = "torch.prims.convert_element_type"(%20881, %20882) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
%20884 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%20885 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%20886 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%20887 = "torch.aten.slice.Tensor"(%20880, %20884, %20885, %18481, %20886) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
"torch.bind_symbolic_shape"(%20887, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
%20888 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%20889 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%20890 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
%20891 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%20892 = "torch.aten.slice.Tensor"(%20887, %20888, %20889, %20890, %20891) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
"torch.bind_symbolic_shape"(%20892, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
%20893 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%20894 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%20895 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%20896 = "torch.aten.slice.Tensor"(%20883, %20893, %20894, %18481, %20895) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
"torch.bind_symbolic_shape"(%20896, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
%20897 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%20898 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%20899 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
%20900 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%20901 = "torch.aten.slice.Tensor"(%20896, %20897, %20898, %20899, %20900) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
"torch.bind_symbolic_shape"(%20901, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
%20902 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%20903 = "torch.aten.unsqueeze"(%20892, %20902) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
"torch.bind_symbolic_shape"(%20903, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
%20904 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%20905 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%20906 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
%20907 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%20908 = "torch.aten.slice.Tensor"(%20903, %20904, %20905, %20906, %20907) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
"torch.bind_symbolic_shape"(%20908, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
%20909 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
%20910 = "torch.aten.unsqueeze"(%20908, %20909) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
"torch.bind_symbolic_shape"(%20910, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
%20911 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
%20912 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%20913 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
%20914 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%20915 = "torch.aten.slice.Tensor"(%20910, %20911, %20912, %20913, %20914) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
"torch.bind_symbolic_shape"(%20915, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
%20916 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%20917 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%20918 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%20919 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%20920 = "torch.prim.ListConstruct"(%20916, %20917, %20918, %20919) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%20921 = "torch.aten.repeat"(%20915, %20920) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
"torch.bind_symbolic_shape"(%20921, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
%20922 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%20923 = "torch.aten.unsqueeze"(%20901, %20922) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
"torch.bind_symbolic_shape"(%20923, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
%20924 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%20925 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%20926 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
%20927 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%20928 = "torch.aten.slice.Tensor"(%20923, %20924, %20925, %20926, %20927) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
"torch.bind_symbolic_shape"(%20928, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
%20929 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
%20930 = "torch.aten.unsqueeze"(%20928, %20929) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
"torch.bind_symbolic_shape"(%20930, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
%20931 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
%20932 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%20933 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
%20934 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%20935 = "torch.aten.slice.Tensor"(%20930, %20931, %20932, %20933, %20934) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
"torch.bind_symbolic_shape"(%20935, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
%20936 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%20937 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%20938 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%20939 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%20940 = "torch.prim.ListConstruct"(%20936, %20937, %20938, %20939) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%20941 = "torch.aten.repeat"(%20935, %20940) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
"torch.bind_symbolic_shape"(%20941, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
%20942 = "torch.aten.mul.Tensor"(%20654, %20921) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%20942, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
%20943 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
%20944 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%20945 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
%20946 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%20947 = "torch.aten.slice.Tensor"(%20654, %20943, %20944, %20945, %20946) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%20947, %18474) <{shape_expressions = #map19}> : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.int) -> ()
%20948 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
%20949 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
%20950 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
%20951 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%20952 = "torch.aten.slice.Tensor"(%20654, %20948, %20949, %20950, %20951) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%20952, %18474) <{shape_expressions = #map19}> : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.int) -> ()
%20953 = "torch.aten.neg"(%20952) : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>) -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%20953, %18474) <{shape_expressions = #map19}> : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.int) -> ()
%20954 = "torch.prim.ListConstruct"(%20953, %20947) : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>) -> !torch.list<vtensor>
%20955 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
%20956 = "torch.aten.cat"(%20954, %20955) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%20956, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
%20957 = "torch.aten.mul.Tensor"(%20956, %20941) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%20957, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
%20958 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%20959 = "torch.aten.add.Tensor"(%20942, %20957, %20958) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%20959, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
%20960 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
%20961 = "torch.aten.mul.Scalar"(%arg69, %20960) : (!torch.vtensor<[4,?],si64>, !torch.int) -> !torch.vtensor<[4,?],si64>
"torch.bind_symbolic_shape"(%20961, %18474) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
%20962 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
%20963 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%20964 = "torch.aten.add.Scalar"(%20961, %20962, %20963) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
"torch.bind_symbolic_shape"(%20964, %18474) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
%20965 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%20966 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
%20967 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
%20968 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
%20969 = "torch.prim.ListConstruct"(%20965, %18477, %20966, %20967, %20968) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%20970 = "torch.aten.view"(%20959, %20969) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%20970, %18474) <{shape_expressions = #map22}> : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
%20971 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
%20972 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
%20973 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
%20974 = "torch.prim.ListConstruct"(%19011, %20971, %20972, %20973) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%20975 = "torch.aten.view"(%20970, %20974) : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%20975, %18474) <{shape_expressions = #map23}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
%20976 = "torch.prim.ListConstruct"(%19011) : (!torch.int) -> !torch.list<int>
%20977 = "torch.aten.view"(%20964, %20976) : (!torch.vtensor<[4,?],si64>, !torch.list<int>) -> !torch.vtensor<[?],si64>
"torch.bind_symbolic_shape"(%20977, %18474) <{shape_expressions = #map24}> : (!torch.vtensor<[?],si64>, !torch.int) -> ()
%20978 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
%20979 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
%20980 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
%20981 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
%20982 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
%20983 = "torch.prim.ListConstruct"(%18479, %20978, %20979, %20980, %20981, %20982) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%20984 = "torch.aten.view"(%20386, %20983) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%20984, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
%20985 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
%20986 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
%20987 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
%20988 = "torch.prim.ListConstruct"(%18993, %20985, %20986, %20987) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%20989 = "torch.aten.view"(%20984, %20988) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%20989, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
%20990 = "torch.prim.ListConstruct"(%20977) : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>>
%20991 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
%20992 = "torch.aten.index_put"(%20989, %20990, %20975, %20991) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.bool) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%20992, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
%20993 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
%20994 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
%20995 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
%20996 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
%20997 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
%20998 = "torch.prim.ListConstruct"(%18479, %20993, %20994, %20995, %20996, %20997) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%20999 = "torch.aten.view"(%20992, %20998) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%20999, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
%21000 = "torch.constant.int"() <{value = 2097152 : i64}> : () -> !torch.int
%21001 = "torch.prim.ListConstruct"(%18479, %21000) : (!torch.int, !torch.int) -> !torch.list<int>
%21002 = "torch.aten.view"(%20999, %21001) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,2097152],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%21002, %18475) <{shape_expressions = #map2}> : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> ()
%21003 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
%21004 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
%21005 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
%21006 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
%21007 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
%21008 = "torch.prim.ListConstruct"(%18479, %21003, %21004, %21005, %21006, %21007) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%21009 = "torch.aten.view"(%21002, %21008) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%21009, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
%21010 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
%21011 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
%21012 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
%21013 = "torch.prim.ListConstruct"(%18993, %21010, %21011, %21012) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%21014 = "torch.aten.view"(%21009, %21013) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%21014, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
%21015 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%21016 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
%21017 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
%21018 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
%21019 = "torch.prim.ListConstruct"(%21015, %18477, %21016, %21017, %21018) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%21020 = "torch.aten.view"(%20659, %21019) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%21020, %18474) <{shape_expressions = #map22}> : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
%21021 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
%21022 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
%21023 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
%21024 = "torch.prim.ListConstruct"(%19011, %21021, %21022, %21023) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%21025 = "torch.aten.view"(%21020, %21024) : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%21025, %18474) <{shape_expressions = #map23}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
%21026 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%21027 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%21028 = "torch.aten.add.Scalar"(%20964, %21026, %21027) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
"torch.bind_symbolic_shape"(%21028, %18474) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
%21029 = "torch.prim.ListConstruct"(%19011) : (!torch.int) -> !torch.list<int>
%21030 = "torch.aten.view"(%21028, %21029) : (!torch.vtensor<[4,?],si64>, !torch.list<int>) -> !torch.vtensor<[?],si64>
"torch.bind_symbolic_shape"(%21030, %18474) <{shape_expressions = #map24}> : (!torch.vtensor<[?],si64>, !torch.int) -> ()
%21031 = "torch.prim.ListConstruct"(%21030) : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>>
%21032 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
%21033 = "torch.aten.index_put"(%21014, %21031, %21025, %21032) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.bool) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%21033, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
%21034 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
%21035 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
%21036 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
%21037 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
%21038 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
%21039 = "torch.prim.ListConstruct"(%18479, %21034, %21035, %21036, %21037, %21038) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%21040 = "torch.aten.view"(%21033, %21039) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%21040, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
%21041 = "torch.constant.int"() <{value = 2097152 : i64}> : () -> !torch.int
%21042 = "torch.prim.ListConstruct"(%18479, %21041) : (!torch.int, !torch.int) -> !torch.list<int>
%21043 = "torch.aten.view"(%21040, %21042) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,2097152],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%21043, %18475) <{shape_expressions = #map2}> : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> ()
%21044 = "torch.constant.int"() <{value = -2 : i64}> : () -> !torch.int
%21045 = "torch.aten.unsqueeze"(%20959, %21044) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%21045, %18474) <{shape_expressions = #map25}> : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.int) -> ()
%21046 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%21047 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
%21048 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%21049 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
%21050 = "torch.prim.ListConstruct"(%21046, %18481, %21047, %21048, %21049) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%21051 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
%21052 = "torch.aten.expand"(%21045, %21050, %21051) : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%21052, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
%21053 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%21054 = "torch.aten.clone"(%21052, %21053) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%21054, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
%21055 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%21056 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
%21057 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
%21058 = "torch.prim.ListConstruct"(%21055, %18481, %21056, %21057) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%21059 = "torch.aten._unsafe_view"(%21054, %21058) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%21059, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
%21060 = "torch.constant.int"() <{value = -2 : i64}> : () -> !torch.int
%21061 = "torch.aten.unsqueeze"(%20659, %21060) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%21061, %18474) <{shape_expressions = #map25}> : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.int) -> ()
%21062 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%21063 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
%21064 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%21065 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
%21066 = "torch.prim.ListConstruct"(%21062, %18481, %21063, %21064, %21065) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%21067 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
%21068 = "torch.aten.expand"(%21061, %21066, %21067) : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%21068, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
%21069 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%21070 = "torch.aten.clone"(%21068, %21069) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%21070, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
%21071 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%21072 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
%21073 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
%21074 = "torch.prim.ListConstruct"(%21071, %18481, %21072, %21073) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%21075 = "torch.aten._unsafe_view"(%21070, %21074) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%21075, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
%21076 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%21077 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
%21078 = "torch.aten.transpose.int"(%20809, %21076, %21077) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%21078, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
%21079 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%21080 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
%21081 = "torch.aten.transpose.int"(%21059, %21079, %21080) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%21081, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
%21082 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%21083 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
%21084 = "torch.aten.transpose.int"(%21075, %21082, %21083) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%21084, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
%21085 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%21086 = "torch.aten.squeeze.dim"(%18570, %21085) : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,1,?,?],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%21086, %18474) <{shape_expressions = #map8}> : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> ()
%21087 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%21088 = "torch.aten.squeeze.dim"(%21086, %21087) : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,1,?,?],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%21088, %18474) <{shape_expressions = #map8}> : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> ()
%21089 = "torch_c.to_builtin_tensor"(%21078) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
%21090 = "torch_c.to_builtin_tensor"(%21081) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
%21091 = "torch_c.to_builtin_tensor"(%21084) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
%21092 = "torch_c.to_builtin_tensor"(%21088) : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>) -> tensor<4x1x?x?xf8E4M3FNUZ>
%21093 = "tensor.cast"(%21092) : (tensor<4x1x?x?xf8E4M3FNUZ>) -> tensor<?x?x?x?xf8E4M3FNUZ>
%21094 = "torch_c.to_builtin_tensor"(%17329) : (!torch.vtensor<[],f32>) -> tensor<f32>
%21095 = "util.call"(%21089, %21090, %21091, %21094, %21093) <{callee = @sharktank_masked_flash_attention_4_32_128_128_f8E4M3FNUZ_f32_f32}> : (tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<f32>, tensor<?x?x?x?xf8E4M3FNUZ>) -> tensor<4x32x?x128xf32>
%21096 = "torch_c.from_builtin_tensor"(%21095) : (tensor<4x32x?x128xf32>) -> !torch.vtensor<[4,32,?,128],f32>
"torch.bind_symbolic_shape"(%21096, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f32>, !torch.int) -> ()
%21097 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%21098 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
%21099 = "torch.aten.transpose.int"(%21096, %21097, %21098) : (!torch.vtensor<[4,32,?,128],f32>, !torch.int, !torch.int) -> !torch.vtensor<[4,?,32,128],f32>
"torch.bind_symbolic_shape"(%21099, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f32>, !torch.int) -> ()
%21100 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%21101 = "torch.aten.clone"(%21099, %21100) : (!torch.vtensor<[4,?,32,128],f32>, !torch.int) -> !torch.vtensor<[4,?,32,128],f32>
"torch.bind_symbolic_shape"(%21101, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f32>, !torch.int) -> ()
%21102 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%21103 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
%21104 = "torch.prim.ListConstruct"(%21102, %18481, %21103) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%21105 = "torch.aten._unsafe_view"(%21101, %21104) : (!torch.vtensor<[4,?,32,128],f32>, !torch.list<int>) -> !torch.vtensor<[4,?,4096],f32>
"torch.bind_symbolic_shape"(%21105, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
%21106 = "torch.aten.div.Tensor"(%21105, %17331) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],f32>
"torch.bind_symbolic_shape"(%21106, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
%21107 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
%21108 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
%21109 = "torch.aten.clamp"(%21106, %21107, %21108) : (!torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],f32>
"torch.bind_symbolic_shape"(%21109, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
%21110 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
%21111 = "torch.prims.convert_element_type"(%21109, %21110) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%21111, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
%21112 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%21113 = "torch.aten.unsqueeze"(%17333, %21112) : (!torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>
%21114 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%21115 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
%21116 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
%21117 = "torch.prim.ListConstruct"(%21114, %21115, %21116) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%21118 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
%21119 = "torch.aten.expand"(%21113, %21117, %21118) : (!torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ>
%21120 = "torch_c.to_builtin_tensor"(%21111) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
%21121 = "torch_c.to_builtin_tensor"(%21119) : (!torch.vtensor<[4,4096,4096],f8E4M3FNUZ>) -> tensor<4x4096x4096xf8E4M3FNUZ>
%21122 = "util.call"(%21120, %21121) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x?x4096xf32>
%21123 = "torch_c.from_builtin_tensor"(%21122) : (tensor<4x?x4096xf32>) -> !torch.vtensor<[4,?,4096],f32>
"torch.bind_symbolic_shape"(%21123, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
%21124 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
%21125 = "torch.prims.convert_element_type"(%21123, %21124) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
"torch.bind_symbolic_shape"(%21125, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
%21126 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%21127 = "torch.aten.add.Tensor"(%20553, %21125, %21126) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
"torch.bind_symbolic_shape"(%21127, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
%21128 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
%21129 = "torch.prims.convert_element_type"(%21127, %21128) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
"torch.bind_symbolic_shape"(%21129, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
%21130 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
%21131 = "torch.aten.pow.Tensor_Scalar"(%21129, %21130) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
"torch.bind_symbolic_shape"(%21131, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
%21132 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
%21133 = "torch.prim.ListConstruct"(%21132) : (!torch.int) -> !torch.list<int>
%21134 = "torch.constant.bool"() <{value = true}> : () -> !torch.bool
%21135 = "torch.constant.none"() : () -> !torch.none
%21136 = "torch.aten.mean.dim"(%21131, %21133, %21134, %21135) : (!torch.vtensor<[4,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none) -> !torch.vtensor<[4,?,1],f32>
"torch.bind_symbolic_shape"(%21136, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
%21137 = "torch.constant.float"() <{value = 1.000000e-05 : f64}> : () -> !torch.float
%21138 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%21139 = "torch.aten.add.Scalar"(%21136, %21137, %21138) : (!torch.vtensor<[4,?,1],f32>, !torch.float, !torch.int) -> !torch.vtensor<[4,?,1],f32>
"torch.bind_symbolic_shape"(%21139, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
%21140 = "torch.aten.rsqrt"(%21139) : (!torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,1],f32>
"torch.bind_symbolic_shape"(%21140, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
%21141 = "torch.aten.mul.Tensor"(%21129, %21140) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,4096],f32>
"torch.bind_symbolic_shape"(%21141, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
%21142 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
%21143 = "torch.prims.convert_element_type"(%21141, %21142) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
"torch.bind_symbolic_shape"(%21143, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
%21144 = "torch.aten.mul.Tensor"(%17335, %21143) : (!torch.vtensor<[4096],bf16>, !torch.vtensor<[4,?,4096],bf16>) -> !torch.vtensor<[4,?,4096],bf16>
"torch.bind_symbolic_shape"(%21144, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
%21145 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
%21146 = "torch.prims.convert_element_type"(%21144, %21145) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
"torch.bind_symbolic_shape"(%21146, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
%21147 = "torch.aten.div.Tensor"(%21146, %17337) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
"torch.bind_symbolic_shape"(%21147, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
%21148 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
%21149 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
%21150 = "torch.aten.clamp"(%21147, %21148, %21149) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
"torch.bind_symbolic_shape"(%21150, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
%21151 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
%21152 = "torch.prims.convert_element_type"(%21150, %21151) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%21152, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
%21153 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%21154 = "torch.aten.unsqueeze"(%17339, %21153) : (!torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>
%21155 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%21156 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
%21157 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
%21158 = "torch.prim.ListConstruct"(%21155, %21156, %21157) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%21159 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
%21160 = "torch.aten.expand"(%21154, %21158, %21159) : (!torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ>
%21161 = "torch_c.to_builtin_tensor"(%21152) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
%21162 = "torch_c.to_builtin_tensor"(%21160) : (!torch.vtensor<[4,14336,4096],f8E4M3FNUZ>) -> tensor<4x14336x4096xf8E4M3FNUZ>
%21163 = "util.call"(%21161, %21162) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x?x14336xf32>
%21164 = "torch_c.from_builtin_tensor"(%21163) : (tensor<4x?x14336xf32>) -> !torch.vtensor<[4,?,14336],f32>
"torch.bind_symbolic_shape"(%21164, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> ()
%21165 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
%21166 = "torch.prims.convert_element_type"(%21164, %21165) : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> !torch.vtensor<[4,?,14336],bf16>
"torch.bind_symbolic_shape"(%21166, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
%21167 = "torch.aten.silu"(%21166) : (!torch.vtensor<[4,?,14336],bf16>) -> !torch.vtensor<[4,?,14336],bf16>
"torch.bind_symbolic_shape"(%21167, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
%21168 = "torch.aten.div.Tensor"(%21146, %17341) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
"torch.bind_symbolic_shape"(%21168, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
%21169 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
%21170 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
%21171 = "torch.aten.clamp"(%21168, %21169, %21170) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
"torch.bind_symbolic_shape"(%21171, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
%21172 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
%21173 = "torch.prims.convert_element_type"(%21171, %21172) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%21173, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
%21174 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%21175 = "torch.aten.unsqueeze"(%17343, %21174) : (!torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>
%21176 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%21177 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
%21178 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
%21179 = "torch.prim.ListConstruct"(%21176, %21177, %21178) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%21180 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
%21181 = "torch.aten.expand"(%21175, %21179, %21180) : (!torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ>
%21182 = "torch_c.to_builtin_tensor"(%21173) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
%21183 = "torch_c.to_builtin_tensor"(%21181) : (!torch.vtensor<[4,14336,4096],f8E4M3FNUZ>) -> tensor<4x14336x4096xf8E4M3FNUZ>
%21184 = "util.call"(%21182, %21183) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x?x14336xf32>
%21185 = "torch_c.from_builtin_tensor"(%21184) : (tensor<4x?x14336xf32>) -> !torch.vtensor<[4,?,14336],f32>
"torch.bind_symbolic_shape"(%21185, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> ()
%21186 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
%21187 = "torch.prims.convert_element_type"(%21185, %21186) : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> !torch.vtensor<[4,?,14336],bf16>
"torch.bind_symbolic_shape"(%21187, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
%21188 = "torch.aten.mul.Tensor"(%21167, %21187) : (!torch.vtensor<[4,?,14336],bf16>, !torch.vtensor<[4,?,14336],bf16>) -> !torch.vtensor<[4,?,14336],bf16>
"torch.bind_symbolic_shape"(%21188, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
%21189 = "torch.aten.div.Tensor"(%21188, %17345) : (!torch.vtensor<[4,?,14336],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,14336],bf16>
"torch.bind_symbolic_shape"(%21189, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
%21190 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
%21191 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
%21192 = "torch.aten.clamp"(%21189, %21190, %21191) : (!torch.vtensor<[4,?,14336],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,14336],bf16>
"torch.bind_symbolic_shape"(%21192, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
%21193 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
%21194 = "torch.prims.convert_element_type"(%21192, %21193) : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> !torch.vtensor<[4,?,14336],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%21194, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],f8E4M3FNUZ>, !torch.int) -> ()
%21195 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%21196 = "torch.aten.unsqueeze"(%17347, %21195) : (!torch.vtensor<[4096,14336],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,14336],f8E4M3FNUZ>
%21197 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%21198 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
%21199 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
%21200 = "torch.prim.ListConstruct"(%21197, %21198, %21199) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%21201 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
%21202 = "torch.aten.expand"(%21196, %21200, %21201) : (!torch.vtensor<[1,4096,14336],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,14336],f8E4M3FNUZ>
%21203 = "torch_c.to_builtin_tensor"(%21194) : (!torch.vtensor<[4,?,14336],f8E4M3FNUZ>) -> tensor<4x?x14336xf8E4M3FNUZ>
%21204 = "torch_c.to_builtin_tensor"(%21202) : (!torch.vtensor<[4,4096,14336],f8E4M3FNUZ>) -> tensor<4x4096x14336xf8E4M3FNUZ>
%21205 = "util.call"(%21203, %21204) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx14336xf8E4M3FNUZ_R4x4096x14336xf8E4M3FNUZ}> : (tensor<4x?x14336xf8E4M3FNUZ>, tensor<4x4096x14336xf8E4M3FNUZ>) -> tensor<4x?x4096xf32>
%21206 = "torch_c.from_builtin_tensor"(%21205) : (tensor<4x?x4096xf32>) -> !torch.vtensor<[4,?,4096],f32>
"torch.bind_symbolic_shape"(%21206, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
%21207 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
%21208 = "torch.prims.convert_element_type"(%21206, %21207) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
"torch.bind_symbolic_shape"(%21208, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
%21209 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%21210 = "torch.aten.add.Tensor"(%21127, %21208, %21209) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
"torch.bind_symbolic_shape"(%21210, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
%21211 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
%21212 = "torch.prims.convert_element_type"(%21210, %21211) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
"torch.bind_symbolic_shape"(%21212, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
%21213 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
%21214 = "torch.aten.pow.Tensor_Scalar"(%21212, %21213) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
"torch.bind_symbolic_shape"(%21214, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
%21215 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
%21216 = "torch.prim.ListConstruct"(%21215) : (!torch.int) -> !torch.list<int>
%21217 = "torch.constant.bool"() <{value = true}> : () -> !torch.bool
%21218 = "torch.constant.none"() : () -> !torch.none
%21219 = "torch.aten.mean.dim"(%21214, %21216, %21217, %21218) : (!torch.vtensor<[4,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none) -> !torch.vtensor<[4,?,1],f32>
"torch.bind_symbolic_shape"(%21219, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
%21220 = "torch.constant.float"() <{value = 1.000000e-05 : f64}> : () -> !torch.float
%21221 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%21222 = "torch.aten.add.Scalar"(%21219, %21220, %21221) : (!torch.vtensor<[4,?,1],f32>, !torch.float, !torch.int) -> !torch.vtensor<[4,?,1],f32>
"torch.bind_symbolic_shape"(%21222, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
%21223 = "torch.aten.rsqrt"(%21222) : (!torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,1],f32>
"torch.bind_symbolic_shape"(%21223, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
%21224 = "torch.aten.mul.Tensor"(%21212, %21223) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,4096],f32>
"torch.bind_symbolic_shape"(%21224, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
%21225 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
%21226 = "torch.prims.convert_element_type"(%21224, %21225) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
"torch.bind_symbolic_shape"(%21226, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
%21227 = "torch.aten.mul.Tensor"(%17349, %21226) : (!torch.vtensor<[4096],bf16>, !torch.vtensor<[4,?,4096],bf16>) -> !torch.vtensor<[4,?,4096],bf16>
"torch.bind_symbolic_shape"(%21227, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
%21228 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
%21229 = "torch.prims.convert_element_type"(%21227, %21228) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
"torch.bind_symbolic_shape"(%21229, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
%21230 = "torch.aten.div.Tensor"(%21229, %17351) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
"torch.bind_symbolic_shape"(%21230, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
%21231 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
%21232 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
%21233 = "torch.aten.clamp"(%21230, %21231, %21232) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
"torch.bind_symbolic_shape"(%21233, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
%21234 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
%21235 = "torch.prims.convert_element_type"(%21233, %21234) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%21235, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
%21236 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%21237 = "torch.aten.unsqueeze"(%17353, %21236) : (!torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>
%21238 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%21239 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
%21240 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
%21241 = "torch.prim.ListConstruct"(%21238, %21239, %21240) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%21242 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
%21243 = "torch.aten.expand"(%21237, %21241, %21242) : (!torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ>
%21244 = "torch_c.to_builtin_tensor"(%21235) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
%21245 = "torch_c.to_builtin_tensor"(%21243) : (!torch.vtensor<[4,4096,4096],f8E4M3FNUZ>) -> tensor<4x4096x4096xf8E4M3FNUZ>
%21246 = "util.call"(%21244, %21245) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x?x4096xf32>
%21247 = "torch_c.from_builtin_tensor"(%21246) : (tensor<4x?x4096xf32>) -> !torch.vtensor<[4,?,4096],f32>
"torch.bind_symbolic_shape"(%21247, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
%21248 = "torch.aten.div.Tensor"(%21247, %17355) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],f32>
"torch.bind_symbolic_shape"(%21248, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
%21249 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
%21250 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
%21251 = "torch.aten.clamp"(%21248, %21249, %21250) : (!torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],f32>
"torch.bind_symbolic_shape"(%21251, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
%21252 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
%21253 = "torch.prims.convert_element_type"(%21251, %21252) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%21253, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
%21254 = "torch.aten.div.Tensor"(%21229, %17357) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
"torch.bind_symbolic_shape"(%21254, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
%21255 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
%21256 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
%21257 = "torch.aten.clamp"(%21254, %21255, %21256) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
"torch.bind_symbolic_shape"(%21257, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
%21258 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
%21259 = "torch.prims.convert_element_type"(%21257, %21258) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%21259, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
%21260 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%21261 = "torch.aten.unsqueeze"(%17359, %21260) : (!torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>
%21262 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%21263 = "torch.constant.int"() <{value = 1024 : i64}> : () -> !torch.int
%21264 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
%21265 = "torch.prim.ListConstruct"(%21262, %21263, %21264) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%21266 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
%21267 = "torch.aten.expand"(%21261, %21265, %21266) : (!torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ>
%21268 = "torch_c.to_builtin_tensor"(%21259) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
%21269 = "torch_c.to_builtin_tensor"(%21267) : (!torch.vtensor<[4,1024,4096],f8E4M3FNUZ>) -> tensor<4x1024x4096xf8E4M3FNUZ>
%21270 = "util.call"(%21268, %21269) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x?x1024xf32>
%21271 = "torch_c.from_builtin_tensor"(%21270) : (tensor<4x?x1024xf32>) -> !torch.vtensor<[4,?,1024],f32>
"torch.bind_symbolic_shape"(%21271, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
%21272 = "torch.aten.div.Tensor"(%21271, %17361) : (!torch.vtensor<[4,?,1024],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,1024],f32>
"torch.bind_symbolic_shape"(%21272, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
%21273 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
%21274 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
%21275 = "torch.aten.clamp"(%21272, %21273, %21274) : (!torch.vtensor<[4,?,1024],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,1024],f32>
"torch.bind_symbolic_shape"(%21275, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
%21276 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
%21277 = "torch.prims.convert_element_type"(%21275, %21276) : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> !torch.vtensor<[4,?,1024],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%21277, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.int) -> ()
%21278 = "torch.aten.div.Tensor"(%21229, %17363) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
"torch.bind_symbolic_shape"(%21278, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
%21279 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
%21280 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
%21281 = "torch.aten.clamp"(%21278, %21279, %21280) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
"torch.bind_symbolic_shape"(%21281, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
%21282 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
%21283 = "torch.prims.convert_element_type"(%21281, %21282) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%21283, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
%21284 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%21285 = "torch.aten.unsqueeze"(%17365, %21284) : (!torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>
%21286 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%21287 = "torch.constant.int"() <{value = 1024 : i64}> : () -> !torch.int
%21288 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
%21289 = "torch.prim.ListConstruct"(%21286, %21287, %21288) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%21290 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
%21291 = "torch.aten.expand"(%21285, %21289, %21290) : (!torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ>
%21292 = "torch_c.to_builtin_tensor"(%21283) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
%21293 = "torch_c.to_builtin_tensor"(%21291) : (!torch.vtensor<[4,1024,4096],f8E4M3FNUZ>) -> tensor<4x1024x4096xf8E4M3FNUZ>
%21294 = "util.call"(%21292, %21293) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x?x1024xf32>
%21295 = "torch_c.from_builtin_tensor"(%21294) : (tensor<4x?x1024xf32>) -> !torch.vtensor<[4,?,1024],f32>
"torch.bind_symbolic_shape"(%21295, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
%21296 = "torch.aten.div.Tensor"(%21295, %17367) : (!torch.vtensor<[4,?,1024],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,1024],f32>
"torch.bind_symbolic_shape"(%21296, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
%21297 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
%21298 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
%21299 = "torch.aten.clamp"(%21296, %21297, %21298) : (!torch.vtensor<[4,?,1024],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,1024],f32>
"torch.bind_symbolic_shape"(%21299, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
%21300 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
%21301 = "torch.prims.convert_element_type"(%21299, %21300) : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> !torch.vtensor<[4,?,1024],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%21301, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.int) -> ()
%21302 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%21303 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
%21304 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
%21305 = "torch.prim.ListConstruct"(%21302, %18481, %21303, %21304) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%21306 = "torch.aten.view"(%21253, %21305) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%21306, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
%21307 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%21308 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
%21309 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
%21310 = "torch.prim.ListConstruct"(%21307, %18481, %21308, %21309) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%21311 = "torch.aten.view"(%21277, %21310) : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%21311, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
%21312 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%21313 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
%21314 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
%21315 = "torch.prim.ListConstruct"(%21312, %18481, %21313, %21314) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%21316 = "torch.aten.view"(%21301, %21315) : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%21316, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
%21317 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
%21318 = "torch.constant.none"() : () -> !torch.none
%21319 = "torch.constant.none"() : () -> !torch.none
%21320 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
%21321 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
%21322 = "torch.aten.arange"(%21317, %21318, %21319, %21320, %21321) : (!torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[131072],si64>
%21323 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%21324 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
%21325 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
%21326 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%21327 = "torch.constant.none"() : () -> !torch.none
%21328 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
%21329 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
%21330 = "torch.aten.arange.start_step"(%21323, %21324, %21325, %21326, %21327, %21328, %21329) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[64],si64>
%21331 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
%21332 = "torch.prims.convert_element_type"(%21330, %21331) : (!torch.vtensor<[64],si64>, !torch.int) -> !torch.vtensor<[64],f32>
%21333 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
%21334 = "torch.aten.div.Scalar"(%21332, %21333) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
%21335 = "torch.constant.float"() <{value = 5.000000e+05 : f64}> : () -> !torch.float
%21336 = "torch.aten.pow.Scalar"(%21335, %21334) : (!torch.float, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
%21337 = "torch.aten.reciprocal"(%21336) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
%21338 = "torch.constant.float"() <{value = 1.000000e+00 : f64}> : () -> !torch.float
%21339 = "torch.aten.mul.Scalar"(%21337, %21338) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
%21340 = "torch.aten.reciprocal"(%21339) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
%21341 = "torch.constant.float"() <{value = 6.2831853071795862 : f64}> : () -> !torch.float
%21342 = "torch.aten.mul.Scalar"(%21340, %21341) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
%21343 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
%21344 = "torch.aten.gt.Scalar"(%21342, %21343) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
%21345 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
%21346 = "torch.aten.div.Scalar"(%21339, %21345) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
%21347 = "torch.aten.where.self"(%21344, %21346, %21339) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
%21348 = "torch.aten.reciprocal"(%21342) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
%21349 = "torch.constant.int"() <{value = 8192 : i64}> : () -> !torch.int
%21350 = "torch.aten.mul.Scalar"(%21348, %21349) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
%21351 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%21352 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%21353 = "torch.aten.sub.Scalar"(%21350, %21351, %21352) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
%21354 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
%21355 = "torch.aten.div.Scalar"(%21353, %21354) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
%21356 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%21357 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%21358 = "torch.aten.rsub.Scalar"(%21355, %21356, %21357) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
%21359 = "torch.aten.mul.Tensor"(%21358, %21347) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
%21360 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
%21361 = "torch.aten.div.Scalar"(%21359, %21360) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
%21362 = "torch.aten.mul.Tensor"(%21355, %21347) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
%21363 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%21364 = "torch.aten.add.Tensor"(%21361, %21362, %21363) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
%21365 = "torch.constant.float"() <{value = 2.048000e+03 : f64}> : () -> !torch.float
%21366 = "torch.aten.lt.Scalar"(%21342, %21365) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
%21367 = "torch.aten.bitwise_not"(%21366) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
%21368 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
%21369 = "torch.aten.gt.Scalar"(%21342, %21368) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
%21370 = "torch.aten.bitwise_not"(%21369) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
%21371 = "torch.aten.mul.Tensor"(%21367, %21370) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
%21372 = "torch.aten.where.self"(%21371, %21364, %21347) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
%21373 = "torch.prim.ListConstruct"(%21372, %21372) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.list<vtensor>
%21374 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
%21375 = "torch.aten.cat"(%21373, %21374) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[128],f32>
%21376 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
%21377 = "torch.prims.convert_element_type"(%21322, %21376) : (!torch.vtensor<[131072],si64>, !torch.int) -> !torch.vtensor<[131072],f32>
%21378 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
%21379 = "torch.prims.convert_element_type"(%21375, %21378) : (!torch.vtensor<[128],f32>, !torch.int) -> !torch.vtensor<[128],f32>
%21380 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
%21381 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%21382 = "torch.prim.ListConstruct"(%21380, %21381) : (!torch.int, !torch.int) -> !torch.list<int>
%21383 = "torch.aten.view"(%21377, %21382) : (!torch.vtensor<[131072],f32>, !torch.list<int>) -> !torch.vtensor<[131072,1],f32>
%21384 = "torch.aten.mul.Tensor"(%21383, %21379) : (!torch.vtensor<[131072,1],f32>, !torch.vtensor<[128],f32>) -> !torch.vtensor<[131072,128],f32>
%21385 = "torch.aten.cos"(%21384) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
%21386 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
%21387 = "torch.prims.convert_element_type"(%21385, %21386) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
%21388 = "torch.aten.sin"(%21384) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
%21389 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
%21390 = "torch.prims.convert_element_type"(%21388, %21389) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
%21391 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%21392 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%21393 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%21394 = "torch.aten.slice.Tensor"(%21387, %21391, %21392, %18481, %21393) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
"torch.bind_symbolic_shape"(%21394, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
%21395 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%21396 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%21397 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
%21398 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%21399 = "torch.aten.slice.Tensor"(%21394, %21395, %21396, %21397, %21398) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
"torch.bind_symbolic_shape"(%21399, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
%21400 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%21401 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%21402 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%21403 = "torch.aten.slice.Tensor"(%21390, %21400, %21401, %18481, %21402) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
"torch.bind_symbolic_shape"(%21403, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
%21404 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%21405 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%21406 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
%21407 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%21408 = "torch.aten.slice.Tensor"(%21403, %21404, %21405, %21406, %21407) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
"torch.bind_symbolic_shape"(%21408, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
%21409 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%21410 = "torch.aten.unsqueeze"(%21399, %21409) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
"torch.bind_symbolic_shape"(%21410, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
%21411 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%21412 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%21413 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
%21414 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%21415 = "torch.aten.slice.Tensor"(%21410, %21411, %21412, %21413, %21414) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
"torch.bind_symbolic_shape"(%21415, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
%21416 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
%21417 = "torch.aten.unsqueeze"(%21415, %21416) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
"torch.bind_symbolic_shape"(%21417, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
%21418 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
%21419 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%21420 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
%21421 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%21422 = "torch.aten.slice.Tensor"(%21417, %21418, %21419, %21420, %21421) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
"torch.bind_symbolic_shape"(%21422, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
%21423 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%21424 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%21425 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%21426 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%21427 = "torch.prim.ListConstruct"(%21423, %21424, %21425, %21426) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%21428 = "torch.aten.repeat"(%21422, %21427) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
"torch.bind_symbolic_shape"(%21428, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
%21429 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%21430 = "torch.aten.unsqueeze"(%21408, %21429) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
"torch.bind_symbolic_shape"(%21430, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
%21431 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%21432 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%21433 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
%21434 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%21435 = "torch.aten.slice.Tensor"(%21430, %21431, %21432, %21433, %21434) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
"torch.bind_symbolic_shape"(%21435, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
%21436 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
%21437 = "torch.aten.unsqueeze"(%21435, %21436) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
"torch.bind_symbolic_shape"(%21437, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
%21438 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
%21439 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%21440 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
%21441 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%21442 = "torch.aten.slice.Tensor"(%21437, %21438, %21439, %21440, %21441) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
"torch.bind_symbolic_shape"(%21442, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
%21443 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%21444 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%21445 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%21446 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%21447 = "torch.prim.ListConstruct"(%21443, %21444, %21445, %21446) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%21448 = "torch.aten.repeat"(%21442, %21447) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
"torch.bind_symbolic_shape"(%21448, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
%21449 = "torch.aten.mul.Tensor"(%21306, %21428) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%21449, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
%21450 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
%21451 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%21452 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
%21453 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%21454 = "torch.aten.slice.Tensor"(%21306, %21450, %21451, %21452, %21453) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%21454, %18474) <{shape_expressions = #map18}> : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.int) -> ()
%21455 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
%21456 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
%21457 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
%21458 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%21459 = "torch.aten.slice.Tensor"(%21306, %21455, %21456, %21457, %21458) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%21459, %18474) <{shape_expressions = #map18}> : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.int) -> ()
%21460 = "torch.aten.neg"(%21459) : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>) -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%21460, %18474) <{shape_expressions = #map18}> : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.int) -> ()
%21461 = "torch.prim.ListConstruct"(%21460, %21454) : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>) -> !torch.list<vtensor>
%21462 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
%21463 = "torch.aten.cat"(%21461, %21462) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%21463, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
%21464 = "torch.aten.mul.Tensor"(%21463, %21448) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%21464, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
%21465 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%21466 = "torch.aten.add.Tensor"(%21449, %21464, %21465) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%21466, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
%21467 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
%21468 = "torch.constant.none"() : () -> !torch.none
%21469 = "torch.constant.none"() : () -> !torch.none
%21470 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
%21471 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
%21472 = "torch.aten.arange"(%21467, %21468, %21469, %21470, %21471) : (!torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[131072],si64>
%21473 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%21474 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
%21475 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
%21476 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%21477 = "torch.constant.none"() : () -> !torch.none
%21478 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
%21479 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
%21480 = "torch.aten.arange.start_step"(%21473, %21474, %21475, %21476, %21477, %21478, %21479) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[64],si64>
%21481 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
%21482 = "torch.prims.convert_element_type"(%21480, %21481) : (!torch.vtensor<[64],si64>, !torch.int) -> !torch.vtensor<[64],f32>
%21483 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
%21484 = "torch.aten.div.Scalar"(%21482, %21483) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
%21485 = "torch.constant.float"() <{value = 5.000000e+05 : f64}> : () -> !torch.float
%21486 = "torch.aten.pow.Scalar"(%21485, %21484) : (!torch.float, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
%21487 = "torch.aten.reciprocal"(%21486) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
%21488 = "torch.constant.float"() <{value = 1.000000e+00 : f64}> : () -> !torch.float
%21489 = "torch.aten.mul.Scalar"(%21487, %21488) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
%21490 = "torch.aten.reciprocal"(%21489) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
%21491 = "torch.constant.float"() <{value = 6.2831853071795862 : f64}> : () -> !torch.float
%21492 = "torch.aten.mul.Scalar"(%21490, %21491) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
%21493 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
%21494 = "torch.aten.gt.Scalar"(%21492, %21493) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
%21495 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
%21496 = "torch.aten.div.Scalar"(%21489, %21495) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
%21497 = "torch.aten.where.self"(%21494, %21496, %21489) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
%21498 = "torch.aten.reciprocal"(%21492) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
%21499 = "torch.constant.int"() <{value = 8192 : i64}> : () -> !torch.int
%21500 = "torch.aten.mul.Scalar"(%21498, %21499) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
%21501 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%21502 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%21503 = "torch.aten.sub.Scalar"(%21500, %21501, %21502) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
%21504 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
%21505 = "torch.aten.div.Scalar"(%21503, %21504) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
%21506 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%21507 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%21508 = "torch.aten.rsub.Scalar"(%21505, %21506, %21507) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
%21509 = "torch.aten.mul.Tensor"(%21508, %21497) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
%21510 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
%21511 = "torch.aten.div.Scalar"(%21509, %21510) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
%21512 = "torch.aten.mul.Tensor"(%21505, %21497) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
%21513 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%21514 = "torch.aten.add.Tensor"(%21511, %21512, %21513) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
%21515 = "torch.constant.float"() <{value = 2.048000e+03 : f64}> : () -> !torch.float
%21516 = "torch.aten.lt.Scalar"(%21492, %21515) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
%21517 = "torch.aten.bitwise_not"(%21516) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
%21518 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
%21519 = "torch.aten.gt.Scalar"(%21492, %21518) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
%21520 = "torch.aten.bitwise_not"(%21519) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
%21521 = "torch.aten.mul.Tensor"(%21517, %21520) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
%21522 = "torch.aten.where.self"(%21521, %21514, %21497) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
%21523 = "torch.prim.ListConstruct"(%21522, %21522) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.list<vtensor>
%21524 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
%21525 = "torch.aten.cat"(%21523, %21524) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[128],f32>
%21526 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
%21527 = "torch.prims.convert_element_type"(%21472, %21526) : (!torch.vtensor<[131072],si64>, !torch.int) -> !torch.vtensor<[131072],f32>
%21528 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
%21529 = "torch.prims.convert_element_type"(%21525, %21528) : (!torch.vtensor<[128],f32>, !torch.int) -> !torch.vtensor<[128],f32>
%21530 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
%21531 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%21532 = "torch.prim.ListConstruct"(%21530, %21531) : (!torch.int, !torch.int) -> !torch.list<int>
%21533 = "torch.aten.view"(%21527, %21532) : (!torch.vtensor<[131072],f32>, !torch.list<int>) -> !torch.vtensor<[131072,1],f32>
%21534 = "torch.aten.mul.Tensor"(%21533, %21529) : (!torch.vtensor<[131072,1],f32>, !torch.vtensor<[128],f32>) -> !torch.vtensor<[131072,128],f32>
%21535 = "torch.aten.cos"(%21534) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
%21536 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
%21537 = "torch.prims.convert_element_type"(%21535, %21536) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
%21538 = "torch.aten.sin"(%21534) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
%21539 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
%21540 = "torch.prims.convert_element_type"(%21538, %21539) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
%21541 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%21542 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%21543 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%21544 = "torch.aten.slice.Tensor"(%21537, %21541, %21542, %18481, %21543) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
"torch.bind_symbolic_shape"(%21544, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
%21545 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%21546 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%21547 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
%21548 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%21549 = "torch.aten.slice.Tensor"(%21544, %21545, %21546, %21547, %21548) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
"torch.bind_symbolic_shape"(%21549, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
%21550 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%21551 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%21552 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%21553 = "torch.aten.slice.Tensor"(%21540, %21550, %21551, %18481, %21552) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
"torch.bind_symbolic_shape"(%21553, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
%21554 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%21555 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%21556 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
%21557 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%21558 = "torch.aten.slice.Tensor"(%21553, %21554, %21555, %21556, %21557) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
"torch.bind_symbolic_shape"(%21558, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
%21559 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%21560 = "torch.aten.unsqueeze"(%21549, %21559) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
"torch.bind_symbolic_shape"(%21560, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
%21561 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%21562 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%21563 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
%21564 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%21565 = "torch.aten.slice.Tensor"(%21560, %21561, %21562, %21563, %21564) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
"torch.bind_symbolic_shape"(%21565, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
%21566 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
%21567 = "torch.aten.unsqueeze"(%21565, %21566) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
"torch.bind_symbolic_shape"(%21567, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
%21568 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
%21569 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%21570 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
%21571 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%21572 = "torch.aten.slice.Tensor"(%21567, %21568, %21569, %21570, %21571) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
"torch.bind_symbolic_shape"(%21572, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
%21573 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%21574 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%21575 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%21576 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%21577 = "torch.prim.ListConstruct"(%21573, %21574, %21575, %21576) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%21578 = "torch.aten.repeat"(%21572, %21577) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
"torch.bind_symbolic_shape"(%21578, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
%21579 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%21580 = "torch.aten.unsqueeze"(%21558, %21579) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
"torch.bind_symbolic_shape"(%21580, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
%21581 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%21582 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%21583 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
%21584 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%21585 = "torch.aten.slice.Tensor"(%21580, %21581, %21582, %21583, %21584) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
"torch.bind_symbolic_shape"(%21585, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
%21586 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
%21587 = "torch.aten.unsqueeze"(%21585, %21586) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
"torch.bind_symbolic_shape"(%21587, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
%21588 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
%21589 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%21590 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
%21591 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%21592 = "torch.aten.slice.Tensor"(%21587, %21588, %21589, %21590, %21591) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
"torch.bind_symbolic_shape"(%21592, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
%21593 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%21594 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%21595 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%21596 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%21597 = "torch.prim.ListConstruct"(%21593, %21594, %21595, %21596) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%21598 = "torch.aten.repeat"(%21592, %21597) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
"torch.bind_symbolic_shape"(%21598, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
%21599 = "torch.aten.mul.Tensor"(%21311, %21578) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%21599, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
%21600 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
%21601 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%21602 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
%21603 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%21604 = "torch.aten.slice.Tensor"(%21311, %21600, %21601, %21602, %21603) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%21604, %18474) <{shape_expressions = #map19}> : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.int) -> ()
%21605 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
%21606 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
%21607 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
%21608 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%21609 = "torch.aten.slice.Tensor"(%21311, %21605, %21606, %21607, %21608) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%21609, %18474) <{shape_expressions = #map19}> : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.int) -> ()
%21610 = "torch.aten.neg"(%21609) : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>) -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%21610, %18474) <{shape_expressions = #map19}> : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.int) -> ()
%21611 = "torch.prim.ListConstruct"(%21610, %21604) : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>) -> !torch.list<vtensor>
%21612 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
%21613 = "torch.aten.cat"(%21611, %21612) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%21613, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
%21614 = "torch.aten.mul.Tensor"(%21613, %21598) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%21614, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
%21615 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%21616 = "torch.aten.add.Tensor"(%21599, %21614, %21615) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%21616, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
%21617 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
%21618 = "torch.aten.mul.Scalar"(%arg69, %21617) : (!torch.vtensor<[4,?],si64>, !torch.int) -> !torch.vtensor<[4,?],si64>
"torch.bind_symbolic_shape"(%21618, %18474) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
%21619 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
%21620 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%21621 = "torch.aten.add.Scalar"(%21618, %21619, %21620) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
"torch.bind_symbolic_shape"(%21621, %18474) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
%21622 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%21623 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
%21624 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
%21625 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
%21626 = "torch.prim.ListConstruct"(%21622, %18477, %21623, %21624, %21625) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%21627 = "torch.aten.view"(%21616, %21626) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%21627, %18474) <{shape_expressions = #map22}> : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
%21628 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
%21629 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
%21630 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
%21631 = "torch.prim.ListConstruct"(%19011, %21628, %21629, %21630) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%21632 = "torch.aten.view"(%21627, %21631) : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%21632, %18474) <{shape_expressions = #map23}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
%21633 = "torch.prim.ListConstruct"(%19011) : (!torch.int) -> !torch.list<int>
%21634 = "torch.aten.view"(%21621, %21633) : (!torch.vtensor<[4,?],si64>, !torch.list<int>) -> !torch.vtensor<[?],si64>
"torch.bind_symbolic_shape"(%21634, %18474) <{shape_expressions = #map24}> : (!torch.vtensor<[?],si64>, !torch.int) -> ()
%21635 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
%21636 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
%21637 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
%21638 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
%21639 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
%21640 = "torch.prim.ListConstruct"(%18479, %21635, %21636, %21637, %21638, %21639) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%21641 = "torch.aten.view"(%21043, %21640) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%21641, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
%21642 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
%21643 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
%21644 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
%21645 = "torch.prim.ListConstruct"(%18993, %21642, %21643, %21644) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%21646 = "torch.aten.view"(%21641, %21645) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%21646, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
%21647 = "torch.prim.ListConstruct"(%21634) : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>>
%21648 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
%21649 = "torch.aten.index_put"(%21646, %21647, %21632, %21648) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.bool) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%21649, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
%21650 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
%21651 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
%21652 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
%21653 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
%21654 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
%21655 = "torch.prim.ListConstruct"(%18479, %21650, %21651, %21652, %21653, %21654) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%21656 = "torch.aten.view"(%21649, %21655) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%21656, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
%21657 = "torch.constant.int"() <{value = 2097152 : i64}> : () -> !torch.int
%21658 = "torch.prim.ListConstruct"(%18479, %21657) : (!torch.int, !torch.int) -> !torch.list<int>
%21659 = "torch.aten.view"(%21656, %21658) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,2097152],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%21659, %18475) <{shape_expressions = #map2}> : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> ()
%21660 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
%21661 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
%21662 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
%21663 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
%21664 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
%21665 = "torch.prim.ListConstruct"(%18479, %21660, %21661, %21662, %21663, %21664) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%21666 = "torch.aten.view"(%21659, %21665) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%21666, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
%21667 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
%21668 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
%21669 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
%21670 = "torch.prim.ListConstruct"(%18993, %21667, %21668, %21669) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%21671 = "torch.aten.view"(%21666, %21670) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%21671, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
%21672 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%21673 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
%21674 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
%21675 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
%21676 = "torch.prim.ListConstruct"(%21672, %18477, %21673, %21674, %21675) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%21677 = "torch.aten.view"(%21316, %21676) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%21677, %18474) <{shape_expressions = #map22}> : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
%21678 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
%21679 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
%21680 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
%21681 = "torch.prim.ListConstruct"(%19011, %21678, %21679, %21680) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%21682 = "torch.aten.view"(%21677, %21681) : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%21682, %18474) <{shape_expressions = #map23}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
%21683 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%21684 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%21685 = "torch.aten.add.Scalar"(%21621, %21683, %21684) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
"torch.bind_symbolic_shape"(%21685, %18474) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
%21686 = "torch.prim.ListConstruct"(%19011) : (!torch.int) -> !torch.list<int>
%21687 = "torch.aten.view"(%21685, %21686) : (!torch.vtensor<[4,?],si64>, !torch.list<int>) -> !torch.vtensor<[?],si64>
"torch.bind_symbolic_shape"(%21687, %18474) <{shape_expressions = #map24}> : (!torch.vtensor<[?],si64>, !torch.int) -> ()
%21688 = "torch.prim.ListConstruct"(%21687) : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>>
%21689 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
%21690 = "torch.aten.index_put"(%21671, %21688, %21682, %21689) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.bool) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%21690, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
%21691 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
%21692 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
%21693 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
%21694 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
%21695 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
%21696 = "torch.prim.ListConstruct"(%18479, %21691, %21692, %21693, %21694, %21695) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%21697 = "torch.aten.view"(%21690, %21696) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%21697, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
%21698 = "torch.constant.int"() <{value = 2097152 : i64}> : () -> !torch.int
%21699 = "torch.prim.ListConstruct"(%18479, %21698) : (!torch.int, !torch.int) -> !torch.list<int>
%21700 = "torch.aten.view"(%21697, %21699) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,2097152],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%21700, %18475) <{shape_expressions = #map2}> : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> ()
%21701 = "torch.constant.int"() <{value = -2 : i64}> : () -> !torch.int
%21702 = "torch.aten.unsqueeze"(%21616, %21701) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%21702, %18474) <{shape_expressions = #map25}> : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.int) -> ()
%21703 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%21704 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
%21705 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%21706 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
%21707 = "torch.prim.ListConstruct"(%21703, %18481, %21704, %21705, %21706) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%21708 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
%21709 = "torch.aten.expand"(%21702, %21707, %21708) : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%21709, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
%21710 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%21711 = "torch.aten.clone"(%21709, %21710) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%21711, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
%21712 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%21713 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
%21714 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
%21715 = "torch.prim.ListConstruct"(%21712, %18481, %21713, %21714) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%21716 = "torch.aten._unsafe_view"(%21711, %21715) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%21716, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
%21717 = "torch.constant.int"() <{value = -2 : i64}> : () -> !torch.int
%21718 = "torch.aten.unsqueeze"(%21316, %21717) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%21718, %18474) <{shape_expressions = #map25}> : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.int) -> ()
%21719 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%21720 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
%21721 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%21722 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
%21723 = "torch.prim.ListConstruct"(%21719, %18481, %21720, %21721, %21722) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%21724 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
%21725 = "torch.aten.expand"(%21718, %21723, %21724) : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%21725, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
%21726 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%21727 = "torch.aten.clone"(%21725, %21726) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%21727, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
%21728 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%21729 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
%21730 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
%21731 = "torch.prim.ListConstruct"(%21728, %18481, %21729, %21730) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%21732 = "torch.aten._unsafe_view"(%21727, %21731) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%21732, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
%21733 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%21734 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
%21735 = "torch.aten.transpose.int"(%21466, %21733, %21734) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%21735, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
%21736 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%21737 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
%21738 = "torch.aten.transpose.int"(%21716, %21736, %21737) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%21738, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
%21739 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%21740 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
%21741 = "torch.aten.transpose.int"(%21732, %21739, %21740) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%21741, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
%21742 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%21743 = "torch.aten.squeeze.dim"(%18570, %21742) : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,1,?,?],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%21743, %18474) <{shape_expressions = #map8}> : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> ()
%21744 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%21745 = "torch.aten.squeeze.dim"(%21743, %21744) : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,1,?,?],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%21745, %18474) <{shape_expressions = #map8}> : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> ()
%21746 = "torch_c.to_builtin_tensor"(%21735) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
%21747 = "torch_c.to_builtin_tensor"(%21738) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
%21748 = "torch_c.to_builtin_tensor"(%21741) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
%21749 = "torch_c.to_builtin_tensor"(%21745) : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>) -> tensor<4x1x?x?xf8E4M3FNUZ>
%21750 = "tensor.cast"(%21749) : (tensor<4x1x?x?xf8E4M3FNUZ>) -> tensor<?x?x?x?xf8E4M3FNUZ>
%21751 = "torch_c.to_builtin_tensor"(%17369) : (!torch.vtensor<[],f32>) -> tensor<f32>
%21752 = "util.call"(%21746, %21747, %21748, %21751, %21750) <{callee = @sharktank_masked_flash_attention_4_32_128_128_f8E4M3FNUZ_f32_f32}> : (tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<f32>, tensor<?x?x?x?xf8E4M3FNUZ>) -> tensor<4x32x?x128xf32>
%21753 = "torch_c.from_builtin_tensor"(%21752) : (tensor<4x32x?x128xf32>) -> !torch.vtensor<[4,32,?,128],f32>
"torch.bind_symbolic_shape"(%21753, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f32>, !torch.int) -> ()
%21754 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%21755 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
%21756 = "torch.aten.transpose.int"(%21753, %21754, %21755) : (!torch.vtensor<[4,32,?,128],f32>, !torch.int, !torch.int) -> !torch.vtensor<[4,?,32,128],f32>
"torch.bind_symbolic_shape"(%21756, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f32>, !torch.int) -> ()
%21757 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%21758 = "torch.aten.clone"(%21756, %21757) : (!torch.vtensor<[4,?,32,128],f32>, !torch.int) -> !torch.vtensor<[4,?,32,128],f32>
"torch.bind_symbolic_shape"(%21758, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f32>, !torch.int) -> ()
%21759 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%21760 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
%21761 = "torch.prim.ListConstruct"(%21759, %18481, %21760) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%21762 = "torch.aten._unsafe_view"(%21758, %21761) : (!torch.vtensor<[4,?,32,128],f32>, !torch.list<int>) -> !torch.vtensor<[4,?,4096],f32>
"torch.bind_symbolic_shape"(%21762, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
%21763 = "torch.aten.div.Tensor"(%21762, %17371) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],f32>
"torch.bind_symbolic_shape"(%21763, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
%21764 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
%21765 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
%21766 = "torch.aten.clamp"(%21763, %21764, %21765) : (!torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],f32>
"torch.bind_symbolic_shape"(%21766, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
%21767 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
%21768 = "torch.prims.convert_element_type"(%21766, %21767) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%21768, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
%21769 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%21770 = "torch.aten.unsqueeze"(%17373, %21769) : (!torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>
%21771 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%21772 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
%21773 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
%21774 = "torch.prim.ListConstruct"(%21771, %21772, %21773) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%21775 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
%21776 = "torch.aten.expand"(%21770, %21774, %21775) : (!torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ>
%21777 = "torch_c.to_builtin_tensor"(%21768) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
%21778 = "torch_c.to_builtin_tensor"(%21776) : (!torch.vtensor<[4,4096,4096],f8E4M3FNUZ>) -> tensor<4x4096x4096xf8E4M3FNUZ>
%21779 = "util.call"(%21777, %21778) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x?x4096xf32>
%21780 = "torch_c.from_builtin_tensor"(%21779) : (tensor<4x?x4096xf32>) -> !torch.vtensor<[4,?,4096],f32>
"torch.bind_symbolic_shape"(%21780, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
%21781 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
%21782 = "torch.prims.convert_element_type"(%21780, %21781) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
"torch.bind_symbolic_shape"(%21782, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
%21783 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%21784 = "torch.aten.add.Tensor"(%21210, %21782, %21783) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
"torch.bind_symbolic_shape"(%21784, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
%21785 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
%21786 = "torch.prims.convert_element_type"(%21784, %21785) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
"torch.bind_symbolic_shape"(%21786, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
%21787 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
%21788 = "torch.aten.pow.Tensor_Scalar"(%21786, %21787) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
"torch.bind_symbolic_shape"(%21788, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
%21789 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
%21790 = "torch.prim.ListConstruct"(%21789) : (!torch.int) -> !torch.list<int>
%21791 = "torch.constant.bool"() <{value = true}> : () -> !torch.bool
%21792 = "torch.constant.none"() : () -> !torch.none
%21793 = "torch.aten.mean.dim"(%21788, %21790, %21791, %21792) : (!torch.vtensor<[4,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none) -> !torch.vtensor<[4,?,1],f32>
"torch.bind_symbolic_shape"(%21793, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
%21794 = "torch.constant.float"() <{value = 1.000000e-05 : f64}> : () -> !torch.float
%21795 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%21796 = "torch.aten.add.Scalar"(%21793, %21794, %21795) : (!torch.vtensor<[4,?,1],f32>, !torch.float, !torch.int) -> !torch.vtensor<[4,?,1],f32>
"torch.bind_symbolic_shape"(%21796, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
%21797 = "torch.aten.rsqrt"(%21796) : (!torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,1],f32>
"torch.bind_symbolic_shape"(%21797, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
%21798 = "torch.aten.mul.Tensor"(%21786, %21797) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,4096],f32>
"torch.bind_symbolic_shape"(%21798, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
%21799 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
%21800 = "torch.prims.convert_element_type"(%21798, %21799) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
"torch.bind_symbolic_shape"(%21800, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
%21801 = "torch.aten.mul.Tensor"(%17375, %21800) : (!torch.vtensor<[4096],bf16>, !torch.vtensor<[4,?,4096],bf16>) -> !torch.vtensor<[4,?,4096],bf16>
"torch.bind_symbolic_shape"(%21801, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
%21802 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
%21803 = "torch.prims.convert_element_type"(%21801, %21802) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
"torch.bind_symbolic_shape"(%21803, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
%21804 = "torch.aten.div.Tensor"(%21803, %17377) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
"torch.bind_symbolic_shape"(%21804, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
%21805 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
%21806 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
%21807 = "torch.aten.clamp"(%21804, %21805, %21806) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
"torch.bind_symbolic_shape"(%21807, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
%21808 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
%21809 = "torch.prims.convert_element_type"(%21807, %21808) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%21809, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
%21810 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%21811 = "torch.aten.unsqueeze"(%17379, %21810) : (!torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>
%21812 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%21813 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
%21814 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
%21815 = "torch.prim.ListConstruct"(%21812, %21813, %21814) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%21816 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
%21817 = "torch.aten.expand"(%21811, %21815, %21816) : (!torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ>
%21818 = "torch_c.to_builtin_tensor"(%21809) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
%21819 = "torch_c.to_builtin_tensor"(%21817) : (!torch.vtensor<[4,14336,4096],f8E4M3FNUZ>) -> tensor<4x14336x4096xf8E4M3FNUZ>
%21820 = "util.call"(%21818, %21819) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x?x14336xf32>
%21821 = "torch_c.from_builtin_tensor"(%21820) : (tensor<4x?x14336xf32>) -> !torch.vtensor<[4,?,14336],f32>
"torch.bind_symbolic_shape"(%21821, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> ()
%21822 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
%21823 = "torch.prims.convert_element_type"(%21821, %21822) : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> !torch.vtensor<[4,?,14336],bf16>
"torch.bind_symbolic_shape"(%21823, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
%21824 = "torch.aten.silu"(%21823) : (!torch.vtensor<[4,?,14336],bf16>) -> !torch.vtensor<[4,?,14336],bf16>
"torch.bind_symbolic_shape"(%21824, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
%21825 = "torch.aten.div.Tensor"(%21803, %17381) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
"torch.bind_symbolic_shape"(%21825, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
%21826 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
%21827 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
%21828 = "torch.aten.clamp"(%21825, %21826, %21827) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
"torch.bind_symbolic_shape"(%21828, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
%21829 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
%21830 = "torch.prims.convert_element_type"(%21828, %21829) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%21830, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
%21831 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%21832 = "torch.aten.unsqueeze"(%17383, %21831) : (!torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>
%21833 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%21834 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
%21835 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
%21836 = "torch.prim.ListConstruct"(%21833, %21834, %21835) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%21837 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
%21838 = "torch.aten.expand"(%21832, %21836, %21837) : (!torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ>
%21839 = "torch_c.to_builtin_tensor"(%21830) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
%21840 = "torch_c.to_builtin_tensor"(%21838) : (!torch.vtensor<[4,14336,4096],f8E4M3FNUZ>) -> tensor<4x14336x4096xf8E4M3FNUZ>
%21841 = "util.call"(%21839, %21840) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x?x14336xf32>
%21842 = "torch_c.from_builtin_tensor"(%21841) : (tensor<4x?x14336xf32>) -> !torch.vtensor<[4,?,14336],f32>
"torch.bind_symbolic_shape"(%21842, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> ()
%21843 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
%21844 = "torch.prims.convert_element_type"(%21842, %21843) : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> !torch.vtensor<[4,?,14336],bf16>
"torch.bind_symbolic_shape"(%21844, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
%21845 = "torch.aten.mul.Tensor"(%21824, %21844) : (!torch.vtensor<[4,?,14336],bf16>, !torch.vtensor<[4,?,14336],bf16>) -> !torch.vtensor<[4,?,14336],bf16>
"torch.bind_symbolic_shape"(%21845, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
%21846 = "torch.aten.div.Tensor"(%21845, %17385) : (!torch.vtensor<[4,?,14336],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,14336],bf16>
"torch.bind_symbolic_shape"(%21846, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
%21847 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
%21848 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
%21849 = "torch.aten.clamp"(%21846, %21847, %21848) : (!torch.vtensor<[4,?,14336],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,14336],bf16>
"torch.bind_symbolic_shape"(%21849, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
%21850 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
%21851 = "torch.prims.convert_element_type"(%21849, %21850) : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> !torch.vtensor<[4,?,14336],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%21851, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],f8E4M3FNUZ>, !torch.int) -> ()
%21852 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%21853 = "torch.aten.unsqueeze"(%17387, %21852) : (!torch.vtensor<[4096,14336],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,14336],f8E4M3FNUZ>
%21854 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%21855 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
%21856 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
%21857 = "torch.prim.ListConstruct"(%21854, %21855, %21856) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%21858 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
%21859 = "torch.aten.expand"(%21853, %21857, %21858) : (!torch.vtensor<[1,4096,14336],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,14336],f8E4M3FNUZ>
%21860 = "torch_c.to_builtin_tensor"(%21851) : (!torch.vtensor<[4,?,14336],f8E4M3FNUZ>) -> tensor<4x?x14336xf8E4M3FNUZ>
%21861 = "torch_c.to_builtin_tensor"(%21859) : (!torch.vtensor<[4,4096,14336],f8E4M3FNUZ>) -> tensor<4x4096x14336xf8E4M3FNUZ>
%21862 = "util.call"(%21860, %21861) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx14336xf8E4M3FNUZ_R4x4096x14336xf8E4M3FNUZ}> : (tensor<4x?x14336xf8E4M3FNUZ>, tensor<4x4096x14336xf8E4M3FNUZ>) -> tensor<4x?x4096xf32>
%21863 = "torch_c.from_builtin_tensor"(%21862) : (tensor<4x?x4096xf32>) -> !torch.vtensor<[4,?,4096],f32>
"torch.bind_symbolic_shape"(%21863, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
%21864 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
%21865 = "torch.prims.convert_element_type"(%21863, %21864) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
"torch.bind_symbolic_shape"(%21865, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
%21866 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%21867 = "torch.aten.add.Tensor"(%21784, %21865, %21866) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
"torch.bind_symbolic_shape"(%21867, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
%21868 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
%21869 = "torch.prims.convert_element_type"(%21867, %21868) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
"torch.bind_symbolic_shape"(%21869, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
%21870 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
%21871 = "torch.aten.pow.Tensor_Scalar"(%21869, %21870) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
"torch.bind_symbolic_shape"(%21871, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
%21872 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
%21873 = "torch.prim.ListConstruct"(%21872) : (!torch.int) -> !torch.list<int>
%21874 = "torch.constant.bool"() <{value = true}> : () -> !torch.bool
%21875 = "torch.constant.none"() : () -> !torch.none
%21876 = "torch.aten.mean.dim"(%21871, %21873, %21874, %21875) : (!torch.vtensor<[4,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none) -> !torch.vtensor<[4,?,1],f32>
"torch.bind_symbolic_shape"(%21876, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
%21877 = "torch.constant.float"() <{value = 1.000000e-05 : f64}> : () -> !torch.float
%21878 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%21879 = "torch.aten.add.Scalar"(%21876, %21877, %21878) : (!torch.vtensor<[4,?,1],f32>, !torch.float, !torch.int) -> !torch.vtensor<[4,?,1],f32>
"torch.bind_symbolic_shape"(%21879, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
%21880 = "torch.aten.rsqrt"(%21879) : (!torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,1],f32>
"torch.bind_symbolic_shape"(%21880, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
%21881 = "torch.aten.mul.Tensor"(%21869, %21880) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,4096],f32>
"torch.bind_symbolic_shape"(%21881, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
%21882 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
%21883 = "torch.prims.convert_element_type"(%21881, %21882) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
"torch.bind_symbolic_shape"(%21883, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
%21884 = "torch.aten.mul.Tensor"(%17389, %21883) : (!torch.vtensor<[4096],bf16>, !torch.vtensor<[4,?,4096],bf16>) -> !torch.vtensor<[4,?,4096],bf16>
"torch.bind_symbolic_shape"(%21884, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
%21885 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
%21886 = "torch.prims.convert_element_type"(%21884, %21885) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
"torch.bind_symbolic_shape"(%21886, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
%21887 = "torch.aten.div.Tensor"(%21886, %17391) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
"torch.bind_symbolic_shape"(%21887, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
%21888 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
%21889 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
%21890 = "torch.aten.clamp"(%21887, %21888, %21889) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
"torch.bind_symbolic_shape"(%21890, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
%21891 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
%21892 = "torch.prims.convert_element_type"(%21890, %21891) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%21892, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
%21893 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%21894 = "torch.aten.unsqueeze"(%17393, %21893) : (!torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>
%21895 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%21896 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
%21897 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
%21898 = "torch.prim.ListConstruct"(%21895, %21896, %21897) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%21899 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
%21900 = "torch.aten.expand"(%21894, %21898, %21899) : (!torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ>
%21901 = "torch_c.to_builtin_tensor"(%21892) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
%21902 = "torch_c.to_builtin_tensor"(%21900) : (!torch.vtensor<[4,4096,4096],f8E4M3FNUZ>) -> tensor<4x4096x4096xf8E4M3FNUZ>
%21903 = "util.call"(%21901, %21902) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x?x4096xf32>
%21904 = "torch_c.from_builtin_tensor"(%21903) : (tensor<4x?x4096xf32>) -> !torch.vtensor<[4,?,4096],f32>
"torch.bind_symbolic_shape"(%21904, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
%21905 = "torch.aten.div.Tensor"(%21904, %17395) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],f32>
"torch.bind_symbolic_shape"(%21905, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
%21906 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
%21907 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
%21908 = "torch.aten.clamp"(%21905, %21906, %21907) : (!torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],f32>
"torch.bind_symbolic_shape"(%21908, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
%21909 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
%21910 = "torch.prims.convert_element_type"(%21908, %21909) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%21910, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
%21911 = "torch.aten.div.Tensor"(%21886, %17397) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
"torch.bind_symbolic_shape"(%21911, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
%21912 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
%21913 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
%21914 = "torch.aten.clamp"(%21911, %21912, %21913) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
"torch.bind_symbolic_shape"(%21914, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
%21915 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
%21916 = "torch.prims.convert_element_type"(%21914, %21915) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%21916, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
%21917 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%21918 = "torch.aten.unsqueeze"(%17399, %21917) : (!torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>
%21919 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%21920 = "torch.constant.int"() <{value = 1024 : i64}> : () -> !torch.int
%21921 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
%21922 = "torch.prim.ListConstruct"(%21919, %21920, %21921) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%21923 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
%21924 = "torch.aten.expand"(%21918, %21922, %21923) : (!torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ>
%21925 = "torch_c.to_builtin_tensor"(%21916) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
%21926 = "torch_c.to_builtin_tensor"(%21924) : (!torch.vtensor<[4,1024,4096],f8E4M3FNUZ>) -> tensor<4x1024x4096xf8E4M3FNUZ>
%21927 = "util.call"(%21925, %21926) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x?x1024xf32>
%21928 = "torch_c.from_builtin_tensor"(%21927) : (tensor<4x?x1024xf32>) -> !torch.vtensor<[4,?,1024],f32>
"torch.bind_symbolic_shape"(%21928, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
%21929 = "torch.aten.div.Tensor"(%21928, %17401) : (!torch.vtensor<[4,?,1024],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,1024],f32>
"torch.bind_symbolic_shape"(%21929, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
%21930 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
%21931 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
%21932 = "torch.aten.clamp"(%21929, %21930, %21931) : (!torch.vtensor<[4,?,1024],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,1024],f32>
"torch.bind_symbolic_shape"(%21932, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
%21933 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
%21934 = "torch.prims.convert_element_type"(%21932, %21933) : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> !torch.vtensor<[4,?,1024],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%21934, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.int) -> ()
%21935 = "torch.aten.div.Tensor"(%21886, %17403) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
"torch.bind_symbolic_shape"(%21935, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
%21936 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
%21937 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
%21938 = "torch.aten.clamp"(%21935, %21936, %21937) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
"torch.bind_symbolic_shape"(%21938, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
%21939 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
%21940 = "torch.prims.convert_element_type"(%21938, %21939) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%21940, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
%21941 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%21942 = "torch.aten.unsqueeze"(%17405, %21941) : (!torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>
%21943 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%21944 = "torch.constant.int"() <{value = 1024 : i64}> : () -> !torch.int
%21945 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
%21946 = "torch.prim.ListConstruct"(%21943, %21944, %21945) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%21947 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
%21948 = "torch.aten.expand"(%21942, %21946, %21947) : (!torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ>
%21949 = "torch_c.to_builtin_tensor"(%21940) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
%21950 = "torch_c.to_builtin_tensor"(%21948) : (!torch.vtensor<[4,1024,4096],f8E4M3FNUZ>) -> tensor<4x1024x4096xf8E4M3FNUZ>
%21951 = "util.call"(%21949, %21950) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x?x1024xf32>
%21952 = "torch_c.from_builtin_tensor"(%21951) : (tensor<4x?x1024xf32>) -> !torch.vtensor<[4,?,1024],f32>
"torch.bind_symbolic_shape"(%21952, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
%21953 = "torch.aten.div.Tensor"(%21952, %17407) : (!torch.vtensor<[4,?,1024],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,1024],f32>
"torch.bind_symbolic_shape"(%21953, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
%21954 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
%21955 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
%21956 = "torch.aten.clamp"(%21953, %21954, %21955) : (!torch.vtensor<[4,?,1024],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,1024],f32>
"torch.bind_symbolic_shape"(%21956, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
%21957 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
%21958 = "torch.prims.convert_element_type"(%21956, %21957) : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> !torch.vtensor<[4,?,1024],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%21958, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.int) -> ()
%21959 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%21960 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
%21961 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
%21962 = "torch.prim.ListConstruct"(%21959, %18481, %21960, %21961) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%21963 = "torch.aten.view"(%21910, %21962) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%21963, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
%21964 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%21965 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
%21966 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
%21967 = "torch.prim.ListConstruct"(%21964, %18481, %21965, %21966) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%21968 = "torch.aten.view"(%21934, %21967) : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%21968, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
%21969 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%21970 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
%21971 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
%21972 = "torch.prim.ListConstruct"(%21969, %18481, %21970, %21971) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%21973 = "torch.aten.view"(%21958, %21972) : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%21973, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
%21974 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
%21975 = "torch.constant.none"() : () -> !torch.none
%21976 = "torch.constant.none"() : () -> !torch.none
%21977 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
%21978 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
%21979 = "torch.aten.arange"(%21974, %21975, %21976, %21977, %21978) : (!torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[131072],si64>
%21980 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%21981 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
%21982 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
%21983 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%21984 = "torch.constant.none"() : () -> !torch.none
%21985 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
%21986 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
%21987 = "torch.aten.arange.start_step"(%21980, %21981, %21982, %21983, %21984, %21985, %21986) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[64],si64>
%21988 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
%21989 = "torch.prims.convert_element_type"(%21987, %21988) : (!torch.vtensor<[64],si64>, !torch.int) -> !torch.vtensor<[64],f32>
%21990 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
%21991 = "torch.aten.div.Scalar"(%21989, %21990) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
%21992 = "torch.constant.float"() <{value = 5.000000e+05 : f64}> : () -> !torch.float
%21993 = "torch.aten.pow.Scalar"(%21992, %21991) : (!torch.float, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
%21994 = "torch.aten.reciprocal"(%21993) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
%21995 = "torch.constant.float"() <{value = 1.000000e+00 : f64}> : () -> !torch.float
%21996 = "torch.aten.mul.Scalar"(%21994, %21995) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
%21997 = "torch.aten.reciprocal"(%21996) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
%21998 = "torch.constant.float"() <{value = 6.2831853071795862 : f64}> : () -> !torch.float
%21999 = "torch.aten.mul.Scalar"(%21997, %21998) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
%22000 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
%22001 = "torch.aten.gt.Scalar"(%21999, %22000) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
%22002 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
%22003 = "torch.aten.div.Scalar"(%21996, %22002) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
%22004 = "torch.aten.where.self"(%22001, %22003, %21996) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
%22005 = "torch.aten.reciprocal"(%21999) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
%22006 = "torch.constant.int"() <{value = 8192 : i64}> : () -> !torch.int
%22007 = "torch.aten.mul.Scalar"(%22005, %22006) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
%22008 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%22009 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%22010 = "torch.aten.sub.Scalar"(%22007, %22008, %22009) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
%22011 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
%22012 = "torch.aten.div.Scalar"(%22010, %22011) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
%22013 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%22014 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%22015 = "torch.aten.rsub.Scalar"(%22012, %22013, %22014) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
%22016 = "torch.aten.mul.Tensor"(%22015, %22004) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
%22017 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
%22018 = "torch.aten.div.Scalar"(%22016, %22017) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
%22019 = "torch.aten.mul.Tensor"(%22012, %22004) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
%22020 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%22021 = "torch.aten.add.Tensor"(%22018, %22019, %22020) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
%22022 = "torch.constant.float"() <{value = 2.048000e+03 : f64}> : () -> !torch.float
%22023 = "torch.aten.lt.Scalar"(%21999, %22022) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
%22024 = "torch.aten.bitwise_not"(%22023) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
%22025 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
%22026 = "torch.aten.gt.Scalar"(%21999, %22025) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
%22027 = "torch.aten.bitwise_not"(%22026) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
%22028 = "torch.aten.mul.Tensor"(%22024, %22027) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
%22029 = "torch.aten.where.self"(%22028, %22021, %22004) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
%22030 = "torch.prim.ListConstruct"(%22029, %22029) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.list<vtensor>
%22031 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
%22032 = "torch.aten.cat"(%22030, %22031) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[128],f32>
%22033 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
%22034 = "torch.prims.convert_element_type"(%21979, %22033) : (!torch.vtensor<[131072],si64>, !torch.int) -> !torch.vtensor<[131072],f32>
%22035 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
%22036 = "torch.prims.convert_element_type"(%22032, %22035) : (!torch.vtensor<[128],f32>, !torch.int) -> !torch.vtensor<[128],f32>
%22037 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
%22038 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%22039 = "torch.prim.ListConstruct"(%22037, %22038) : (!torch.int, !torch.int) -> !torch.list<int>
%22040 = "torch.aten.view"(%22034, %22039) : (!torch.vtensor<[131072],f32>, !torch.list<int>) -> !torch.vtensor<[131072,1],f32>
%22041 = "torch.aten.mul.Tensor"(%22040, %22036) : (!torch.vtensor<[131072,1],f32>, !torch.vtensor<[128],f32>) -> !torch.vtensor<[131072,128],f32>
%22042 = "torch.aten.cos"(%22041) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
%22043 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
%22044 = "torch.prims.convert_element_type"(%22042, %22043) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
%22045 = "torch.aten.sin"(%22041) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
%22046 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
%22047 = "torch.prims.convert_element_type"(%22045, %22046) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
%22048 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%22049 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%22050 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%22051 = "torch.aten.slice.Tensor"(%22044, %22048, %22049, %18481, %22050) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
"torch.bind_symbolic_shape"(%22051, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
%22052 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%22053 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%22054 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
%22055 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%22056 = "torch.aten.slice.Tensor"(%22051, %22052, %22053, %22054, %22055) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
"torch.bind_symbolic_shape"(%22056, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
%22057 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%22058 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%22059 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%22060 = "torch.aten.slice.Tensor"(%22047, %22057, %22058, %18481, %22059) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
"torch.bind_symbolic_shape"(%22060, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
%22061 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%22062 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%22063 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
%22064 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%22065 = "torch.aten.slice.Tensor"(%22060, %22061, %22062, %22063, %22064) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
"torch.bind_symbolic_shape"(%22065, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
%22066 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%22067 = "torch.aten.unsqueeze"(%22056, %22066) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
"torch.bind_symbolic_shape"(%22067, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
%22068 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%22069 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%22070 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
%22071 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%22072 = "torch.aten.slice.Tensor"(%22067, %22068, %22069, %22070, %22071) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
"torch.bind_symbolic_shape"(%22072, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
%22073 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
%22074 = "torch.aten.unsqueeze"(%22072, %22073) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
"torch.bind_symbolic_shape"(%22074, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
%22075 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
%22076 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%22077 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
%22078 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%22079 = "torch.aten.slice.Tensor"(%22074, %22075, %22076, %22077, %22078) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
"torch.bind_symbolic_shape"(%22079, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
%22080 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%22081 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%22082 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%22083 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%22084 = "torch.prim.ListConstruct"(%22080, %22081, %22082, %22083) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%22085 = "torch.aten.repeat"(%22079, %22084) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
"torch.bind_symbolic_shape"(%22085, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
%22086 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%22087 = "torch.aten.unsqueeze"(%22065, %22086) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
"torch.bind_symbolic_shape"(%22087, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
%22088 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%22089 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%22090 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
%22091 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%22092 = "torch.aten.slice.Tensor"(%22087, %22088, %22089, %22090, %22091) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
"torch.bind_symbolic_shape"(%22092, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
%22093 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
%22094 = "torch.aten.unsqueeze"(%22092, %22093) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
"torch.bind_symbolic_shape"(%22094, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
%22095 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
%22096 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%22097 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
%22098 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%22099 = "torch.aten.slice.Tensor"(%22094, %22095, %22096, %22097, %22098) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
"torch.bind_symbolic_shape"(%22099, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
%22100 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%22101 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%22102 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%22103 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%22104 = "torch.prim.ListConstruct"(%22100, %22101, %22102, %22103) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%22105 = "torch.aten.repeat"(%22099, %22104) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
"torch.bind_symbolic_shape"(%22105, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
%22106 = "torch.aten.mul.Tensor"(%21963, %22085) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%22106, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
%22107 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
%22108 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%22109 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
%22110 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%22111 = "torch.aten.slice.Tensor"(%21963, %22107, %22108, %22109, %22110) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%22111, %18474) <{shape_expressions = #map18}> : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.int) -> ()
%22112 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
%22113 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
%22114 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
%22115 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%22116 = "torch.aten.slice.Tensor"(%21963, %22112, %22113, %22114, %22115) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%22116, %18474) <{shape_expressions = #map18}> : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.int) -> ()
%22117 = "torch.aten.neg"(%22116) : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>) -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%22117, %18474) <{shape_expressions = #map18}> : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.int) -> ()
%22118 = "torch.prim.ListConstruct"(%22117, %22111) : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>) -> !torch.list<vtensor>
%22119 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
%22120 = "torch.aten.cat"(%22118, %22119) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%22120, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
%22121 = "torch.aten.mul.Tensor"(%22120, %22105) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%22121, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
%22122 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%22123 = "torch.aten.add.Tensor"(%22106, %22121, %22122) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%22123, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
%22124 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
%22125 = "torch.constant.none"() : () -> !torch.none
%22126 = "torch.constant.none"() : () -> !torch.none
%22127 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
%22128 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
%22129 = "torch.aten.arange"(%22124, %22125, %22126, %22127, %22128) : (!torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[131072],si64>
%22130 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%22131 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
%22132 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
%22133 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%22134 = "torch.constant.none"() : () -> !torch.none
%22135 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
%22136 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
%22137 = "torch.aten.arange.start_step"(%22130, %22131, %22132, %22133, %22134, %22135, %22136) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[64],si64>
%22138 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
%22139 = "torch.prims.convert_element_type"(%22137, %22138) : (!torch.vtensor<[64],si64>, !torch.int) -> !torch.vtensor<[64],f32>
%22140 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
%22141 = "torch.aten.div.Scalar"(%22139, %22140) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
%22142 = "torch.constant.float"() <{value = 5.000000e+05 : f64}> : () -> !torch.float
%22143 = "torch.aten.pow.Scalar"(%22142, %22141) : (!torch.float, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
%22144 = "torch.aten.reciprocal"(%22143) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
%22145 = "torch.constant.float"() <{value = 1.000000e+00 : f64}> : () -> !torch.float
%22146 = "torch.aten.mul.Scalar"(%22144, %22145) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
%22147 = "torch.aten.reciprocal"(%22146) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
%22148 = "torch.constant.float"() <{value = 6.2831853071795862 : f64}> : () -> !torch.float
%22149 = "torch.aten.mul.Scalar"(%22147, %22148) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
%22150 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
%22151 = "torch.aten.gt.Scalar"(%22149, %22150) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
%22152 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
%22153 = "torch.aten.div.Scalar"(%22146, %22152) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
%22154 = "torch.aten.where.self"(%22151, %22153, %22146) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
%22155 = "torch.aten.reciprocal"(%22149) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
%22156 = "torch.constant.int"() <{value = 8192 : i64}> : () -> !torch.int
%22157 = "torch.aten.mul.Scalar"(%22155, %22156) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
%22158 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%22159 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%22160 = "torch.aten.sub.Scalar"(%22157, %22158, %22159) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
%22161 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
%22162 = "torch.aten.div.Scalar"(%22160, %22161) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
%22163 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%22164 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%22165 = "torch.aten.rsub.Scalar"(%22162, %22163, %22164) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
%22166 = "torch.aten.mul.Tensor"(%22165, %22154) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
%22167 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
%22168 = "torch.aten.div.Scalar"(%22166, %22167) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
%22169 = "torch.aten.mul.Tensor"(%22162, %22154) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
%22170 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%22171 = "torch.aten.add.Tensor"(%22168, %22169, %22170) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
%22172 = "torch.constant.float"() <{value = 2.048000e+03 : f64}> : () -> !torch.float
%22173 = "torch.aten.lt.Scalar"(%22149, %22172) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
%22174 = "torch.aten.bitwise_not"(%22173) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
%22175 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
%22176 = "torch.aten.gt.Scalar"(%22149, %22175) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
%22177 = "torch.aten.bitwise_not"(%22176) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
%22178 = "torch.aten.mul.Tensor"(%22174, %22177) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
%22179 = "torch.aten.where.self"(%22178, %22171, %22154) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
%22180 = "torch.prim.ListConstruct"(%22179, %22179) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.list<vtensor>
%22181 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
%22182 = "torch.aten.cat"(%22180, %22181) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[128],f32>
%22183 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
%22184 = "torch.prims.convert_element_type"(%22129, %22183) : (!torch.vtensor<[131072],si64>, !torch.int) -> !torch.vtensor<[131072],f32>
%22185 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
%22186 = "torch.prims.convert_element_type"(%22182, %22185) : (!torch.vtensor<[128],f32>, !torch.int) -> !torch.vtensor<[128],f32>
%22187 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
%22188 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%22189 = "torch.prim.ListConstruct"(%22187, %22188) : (!torch.int, !torch.int) -> !torch.list<int>
%22190 = "torch.aten.view"(%22184, %22189) : (!torch.vtensor<[131072],f32>, !torch.list<int>) -> !torch.vtensor<[131072,1],f32>
%22191 = "torch.aten.mul.Tensor"(%22190, %22186) : (!torch.vtensor<[131072,1],f32>, !torch.vtensor<[128],f32>) -> !torch.vtensor<[131072,128],f32>
%22192 = "torch.aten.cos"(%22191) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
%22193 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
%22194 = "torch.prims.convert_element_type"(%22192, %22193) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
%22195 = "torch.aten.sin"(%22191) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
%22196 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
%22197 = "torch.prims.convert_element_type"(%22195, %22196) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
%22198 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%22199 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%22200 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%22201 = "torch.aten.slice.Tensor"(%22194, %22198, %22199, %18481, %22200) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
"torch.bind_symbolic_shape"(%22201, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
%22202 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%22203 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%22204 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
%22205 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%22206 = "torch.aten.slice.Tensor"(%22201, %22202, %22203, %22204, %22205) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
"torch.bind_symbolic_shape"(%22206, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
%22207 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%22208 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%22209 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%22210 = "torch.aten.slice.Tensor"(%22197, %22207, %22208, %18481, %22209) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
"torch.bind_symbolic_shape"(%22210, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
%22211 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%22212 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%22213 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
%22214 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%22215 = "torch.aten.slice.Tensor"(%22210, %22211, %22212, %22213, %22214) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
"torch.bind_symbolic_shape"(%22215, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
%22216 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%22217 = "torch.aten.unsqueeze"(%22206, %22216) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
"torch.bind_symbolic_shape"(%22217, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
%22218 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%22219 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%22220 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
%22221 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%22222 = "torch.aten.slice.Tensor"(%22217, %22218, %22219, %22220, %22221) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
"torch.bind_symbolic_shape"(%22222, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
%22223 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
%22224 = "torch.aten.unsqueeze"(%22222, %22223) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
"torch.bind_symbolic_shape"(%22224, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
%22225 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
%22226 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%22227 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
%22228 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%22229 = "torch.aten.slice.Tensor"(%22224, %22225, %22226, %22227, %22228) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
"torch.bind_symbolic_shape"(%22229, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
%22230 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%22231 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%22232 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%22233 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%22234 = "torch.prim.ListConstruct"(%22230, %22231, %22232, %22233) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%22235 = "torch.aten.repeat"(%22229, %22234) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
"torch.bind_symbolic_shape"(%22235, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
%22236 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%22237 = "torch.aten.unsqueeze"(%22215, %22236) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
"torch.bind_symbolic_shape"(%22237, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
%22238 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%22239 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%22240 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
%22241 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%22242 = "torch.aten.slice.Tensor"(%22237, %22238, %22239, %22240, %22241) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
"torch.bind_symbolic_shape"(%22242, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
%22243 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
%22244 = "torch.aten.unsqueeze"(%22242, %22243) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
"torch.bind_symbolic_shape"(%22244, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
%22245 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
%22246 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%22247 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
%22248 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%22249 = "torch.aten.slice.Tensor"(%22244, %22245, %22246, %22247, %22248) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
"torch.bind_symbolic_shape"(%22249, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
%22250 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%22251 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%22252 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%22253 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%22254 = "torch.prim.ListConstruct"(%22250, %22251, %22252, %22253) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%22255 = "torch.aten.repeat"(%22249, %22254) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
"torch.bind_symbolic_shape"(%22255, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
%22256 = "torch.aten.mul.Tensor"(%21968, %22235) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%22256, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
%22257 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
%22258 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%22259 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
%22260 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%22261 = "torch.aten.slice.Tensor"(%21968, %22257, %22258, %22259, %22260) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%22261, %18474) <{shape_expressions = #map19}> : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.int) -> ()
%22262 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
%22263 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
%22264 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
%22265 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%22266 = "torch.aten.slice.Tensor"(%21968, %22262, %22263, %22264, %22265) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%22266, %18474) <{shape_expressions = #map19}> : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.int) -> ()
%22267 = "torch.aten.neg"(%22266) : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>) -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%22267, %18474) <{shape_expressions = #map19}> : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.int) -> ()
%22268 = "torch.prim.ListConstruct"(%22267, %22261) : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>) -> !torch.list<vtensor>
%22269 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
%22270 = "torch.aten.cat"(%22268, %22269) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%22270, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
%22271 = "torch.aten.mul.Tensor"(%22270, %22255) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%22271, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
%22272 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%22273 = "torch.aten.add.Tensor"(%22256, %22271, %22272) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%22273, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
%22274 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
%22275 = "torch.aten.mul.Scalar"(%arg69, %22274) : (!torch.vtensor<[4,?],si64>, !torch.int) -> !torch.vtensor<[4,?],si64>
"torch.bind_symbolic_shape"(%22275, %18474) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
%22276 = "torch.constant.int"() <{value = 10 : i64}> : () -> !torch.int
%22277 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%22278 = "torch.aten.add.Scalar"(%22275, %22276, %22277) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
"torch.bind_symbolic_shape"(%22278, %18474) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
%22279 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%22280 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
%22281 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
%22282 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
%22283 = "torch.prim.ListConstruct"(%22279, %18477, %22280, %22281, %22282) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%22284 = "torch.aten.view"(%22273, %22283) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%22284, %18474) <{shape_expressions = #map22}> : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
%22285 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
%22286 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
%22287 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
%22288 = "torch.prim.ListConstruct"(%19011, %22285, %22286, %22287) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%22289 = "torch.aten.view"(%22284, %22288) : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%22289, %18474) <{shape_expressions = #map23}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
%22290 = "torch.prim.ListConstruct"(%19011) : (!torch.int) -> !torch.list<int>
%22291 = "torch.aten.view"(%22278, %22290) : (!torch.vtensor<[4,?],si64>, !torch.list<int>) -> !torch.vtensor<[?],si64>
"torch.bind_symbolic_shape"(%22291, %18474) <{shape_expressions = #map24}> : (!torch.vtensor<[?],si64>, !torch.int) -> ()
%22292 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
%22293 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
%22294 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
%22295 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
%22296 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
%22297 = "torch.prim.ListConstruct"(%18479, %22292, %22293, %22294, %22295, %22296) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%22298 = "torch.aten.view"(%21700, %22297) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%22298, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
%22299 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
%22300 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
%22301 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
%22302 = "torch.prim.ListConstruct"(%18993, %22299, %22300, %22301) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%22303 = "torch.aten.view"(%22298, %22302) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%22303, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
%22304 = "torch.prim.ListConstruct"(%22291) : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>>
%22305 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
%22306 = "torch.aten.index_put"(%22303, %22304, %22289, %22305) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.bool) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%22306, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
%22307 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
%22308 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
%22309 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
%22310 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
%22311 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
%22312 = "torch.prim.ListConstruct"(%18479, %22307, %22308, %22309, %22310, %22311) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%22313 = "torch.aten.view"(%22306, %22312) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%22313, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
%22314 = "torch.constant.int"() <{value = 2097152 : i64}> : () -> !torch.int
%22315 = "torch.prim.ListConstruct"(%18479, %22314) : (!torch.int, !torch.int) -> !torch.list<int>
%22316 = "torch.aten.view"(%22313, %22315) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,2097152],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%22316, %18475) <{shape_expressions = #map2}> : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> ()
%22317 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
%22318 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
%22319 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
%22320 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
%22321 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
%22322 = "torch.prim.ListConstruct"(%18479, %22317, %22318, %22319, %22320, %22321) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%22323 = "torch.aten.view"(%22316, %22322) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%22323, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
%22324 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
%22325 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
%22326 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
%22327 = "torch.prim.ListConstruct"(%18993, %22324, %22325, %22326) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%22328 = "torch.aten.view"(%22323, %22327) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%22328, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
%22329 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%22330 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
%22331 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
%22332 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
%22333 = "torch.prim.ListConstruct"(%22329, %18477, %22330, %22331, %22332) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%22334 = "torch.aten.view"(%21973, %22333) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%22334, %18474) <{shape_expressions = #map22}> : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
%22335 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
%22336 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
%22337 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
%22338 = "torch.prim.ListConstruct"(%19011, %22335, %22336, %22337) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%22339 = "torch.aten.view"(%22334, %22338) : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%22339, %18474) <{shape_expressions = #map23}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
%22340 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%22341 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%22342 = "torch.aten.add.Scalar"(%22278, %22340, %22341) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
"torch.bind_symbolic_shape"(%22342, %18474) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
%22343 = "torch.prim.ListConstruct"(%19011) : (!torch.int) -> !torch.list<int>
%22344 = "torch.aten.view"(%22342, %22343) : (!torch.vtensor<[4,?],si64>, !torch.list<int>) -> !torch.vtensor<[?],si64>
"torch.bind_symbolic_shape"(%22344, %18474) <{shape_expressions = #map24}> : (!torch.vtensor<[?],si64>, !torch.int) -> ()
%22345 = "torch.prim.ListConstruct"(%22344) : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>>
%22346 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
%22347 = "torch.aten.index_put"(%22328, %22345, %22339, %22346) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.bool) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%22347, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
%22348 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
%22349 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
%22350 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
%22351 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
%22352 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
%22353 = "torch.prim.ListConstruct"(%18479, %22348, %22349, %22350, %22351, %22352) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%22354 = "torch.aten.view"(%22347, %22353) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%22354, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
%22355 = "torch.constant.int"() <{value = 2097152 : i64}> : () -> !torch.int
%22356 = "torch.prim.ListConstruct"(%18479, %22355) : (!torch.int, !torch.int) -> !torch.list<int>
%22357 = "torch.aten.view"(%22354, %22356) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,2097152],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%22357, %18475) <{shape_expressions = #map2}> : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> ()
%22358 = "torch.constant.int"() <{value = -2 : i64}> : () -> !torch.int
%22359 = "torch.aten.unsqueeze"(%22273, %22358) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%22359, %18474) <{shape_expressions = #map25}> : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.int) -> ()
%22360 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%22361 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
%22362 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%22363 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
%22364 = "torch.prim.ListConstruct"(%22360, %18481, %22361, %22362, %22363) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%22365 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
%22366 = "torch.aten.expand"(%22359, %22364, %22365) : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%22366, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
%22367 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%22368 = "torch.aten.clone"(%22366, %22367) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%22368, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
%22369 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%22370 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
%22371 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
%22372 = "torch.prim.ListConstruct"(%22369, %18481, %22370, %22371) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%22373 = "torch.aten._unsafe_view"(%22368, %22372) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%22373, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
%22374 = "torch.constant.int"() <{value = -2 : i64}> : () -> !torch.int
%22375 = "torch.aten.unsqueeze"(%21973, %22374) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%22375, %18474) <{shape_expressions = #map25}> : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.int) -> ()
%22376 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%22377 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
%22378 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%22379 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
%22380 = "torch.prim.ListConstruct"(%22376, %18481, %22377, %22378, %22379) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%22381 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
%22382 = "torch.aten.expand"(%22375, %22380, %22381) : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%22382, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
%22383 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%22384 = "torch.aten.clone"(%22382, %22383) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%22384, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
%22385 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%22386 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
%22387 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
%22388 = "torch.prim.ListConstruct"(%22385, %18481, %22386, %22387) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%22389 = "torch.aten._unsafe_view"(%22384, %22388) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%22389, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
%22390 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%22391 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
%22392 = "torch.aten.transpose.int"(%22123, %22390, %22391) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%22392, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
%22393 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%22394 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
%22395 = "torch.aten.transpose.int"(%22373, %22393, %22394) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%22395, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
%22396 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%22397 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
%22398 = "torch.aten.transpose.int"(%22389, %22396, %22397) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int) -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%22398, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int) -> ()
%22399 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%22400 = "torch.aten.squeeze.dim"(%18570, %22399) : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,1,?,?],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%22400, %18474) <{shape_expressions = #map8}> : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> ()
%22401 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%22402 = "torch.aten.squeeze.dim"(%22400, %22401) : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,1,?,?],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%22402, %18474) <{shape_expressions = #map8}> : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int) -> ()
%22403 = "torch_c.to_builtin_tensor"(%22392) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
%22404 = "torch_c.to_builtin_tensor"(%22395) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
%22405 = "torch_c.to_builtin_tensor"(%22398) : (!torch.vtensor<[4,32,?,128],f8E4M3FNUZ>) -> tensor<4x32x?x128xf8E4M3FNUZ>
%22406 = "torch_c.to_builtin_tensor"(%22402) : (!torch.vtensor<[4,1,?,?],f8E4M3FNUZ>) -> tensor<4x1x?x?xf8E4M3FNUZ>
%22407 = "tensor.cast"(%22406) : (tensor<4x1x?x?xf8E4M3FNUZ>) -> tensor<?x?x?x?xf8E4M3FNUZ>
%22408 = "torch_c.to_builtin_tensor"(%17409) : (!torch.vtensor<[],f32>) -> tensor<f32>
%22409 = "util.call"(%22403, %22404, %22405, %22408, %22407) <{callee = @sharktank_masked_flash_attention_4_32_128_128_f8E4M3FNUZ_f32_f32}> : (tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<f32>, tensor<?x?x?x?xf8E4M3FNUZ>) -> tensor<4x32x?x128xf32>
%22410 = "torch_c.from_builtin_tensor"(%22409) : (tensor<4x32x?x128xf32>) -> !torch.vtensor<[4,32,?,128],f32>
"torch.bind_symbolic_shape"(%22410, %18474) <{shape_expressions = #map27}> : (!torch.vtensor<[4,32,?,128],f32>, !torch.int) -> ()
%22411 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%22412 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
%22413 = "torch.aten.transpose.int"(%22410, %22411, %22412) : (!torch.vtensor<[4,32,?,128],f32>, !torch.int, !torch.int) -> !torch.vtensor<[4,?,32,128],f32>
"torch.bind_symbolic_shape"(%22413, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f32>, !torch.int) -> ()
%22414 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%22415 = "torch.aten.clone"(%22413, %22414) : (!torch.vtensor<[4,?,32,128],f32>, !torch.int) -> !torch.vtensor<[4,?,32,128],f32>
"torch.bind_symbolic_shape"(%22415, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f32>, !torch.int) -> ()
%22416 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%22417 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
%22418 = "torch.prim.ListConstruct"(%22416, %18481, %22417) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%22419 = "torch.aten._unsafe_view"(%22415, %22418) : (!torch.vtensor<[4,?,32,128],f32>, !torch.list<int>) -> !torch.vtensor<[4,?,4096],f32>
"torch.bind_symbolic_shape"(%22419, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
%22420 = "torch.aten.div.Tensor"(%22419, %17411) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],f32>
"torch.bind_symbolic_shape"(%22420, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
%22421 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
%22422 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
%22423 = "torch.aten.clamp"(%22420, %22421, %22422) : (!torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],f32>
"torch.bind_symbolic_shape"(%22423, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
%22424 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
%22425 = "torch.prims.convert_element_type"(%22423, %22424) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%22425, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
%22426 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%22427 = "torch.aten.unsqueeze"(%17413, %22426) : (!torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>
%22428 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%22429 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
%22430 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
%22431 = "torch.prim.ListConstruct"(%22428, %22429, %22430) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%22432 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
%22433 = "torch.aten.expand"(%22427, %22431, %22432) : (!torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ>
%22434 = "torch_c.to_builtin_tensor"(%22425) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
%22435 = "torch_c.to_builtin_tensor"(%22433) : (!torch.vtensor<[4,4096,4096],f8E4M3FNUZ>) -> tensor<4x4096x4096xf8E4M3FNUZ>
%22436 = "util.call"(%22434, %22435) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x?x4096xf32>
%22437 = "torch_c.from_builtin_tensor"(%22436) : (tensor<4x?x4096xf32>) -> !torch.vtensor<[4,?,4096],f32>
"torch.bind_symbolic_shape"(%22437, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
%22438 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
%22439 = "torch.prims.convert_element_type"(%22437, %22438) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
"torch.bind_symbolic_shape"(%22439, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
%22440 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%22441 = "torch.aten.add.Tensor"(%21867, %22439, %22440) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
"torch.bind_symbolic_shape"(%22441, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
%22442 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
%22443 = "torch.prims.convert_element_type"(%22441, %22442) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
"torch.bind_symbolic_shape"(%22443, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
%22444 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
%22445 = "torch.aten.pow.Tensor_Scalar"(%22443, %22444) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
"torch.bind_symbolic_shape"(%22445, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
%22446 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
%22447 = "torch.prim.ListConstruct"(%22446) : (!torch.int) -> !torch.list<int>
%22448 = "torch.constant.bool"() <{value = true}> : () -> !torch.bool
%22449 = "torch.constant.none"() : () -> !torch.none
%22450 = "torch.aten.mean.dim"(%22445, %22447, %22448, %22449) : (!torch.vtensor<[4,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none) -> !torch.vtensor<[4,?,1],f32>
"torch.bind_symbolic_shape"(%22450, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
%22451 = "torch.constant.float"() <{value = 1.000000e-05 : f64}> : () -> !torch.float
%22452 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%22453 = "torch.aten.add.Scalar"(%22450, %22451, %22452) : (!torch.vtensor<[4,?,1],f32>, !torch.float, !torch.int) -> !torch.vtensor<[4,?,1],f32>
"torch.bind_symbolic_shape"(%22453, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
%22454 = "torch.aten.rsqrt"(%22453) : (!torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,1],f32>
"torch.bind_symbolic_shape"(%22454, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
%22455 = "torch.aten.mul.Tensor"(%22443, %22454) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,4096],f32>
"torch.bind_symbolic_shape"(%22455, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
%22456 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
%22457 = "torch.prims.convert_element_type"(%22455, %22456) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
"torch.bind_symbolic_shape"(%22457, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
%22458 = "torch.aten.mul.Tensor"(%17415, %22457) : (!torch.vtensor<[4096],bf16>, !torch.vtensor<[4,?,4096],bf16>) -> !torch.vtensor<[4,?,4096],bf16>
"torch.bind_symbolic_shape"(%22458, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
%22459 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
%22460 = "torch.prims.convert_element_type"(%22458, %22459) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
"torch.bind_symbolic_shape"(%22460, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
%22461 = "torch.aten.div.Tensor"(%22460, %17417) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
"torch.bind_symbolic_shape"(%22461, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
%22462 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
%22463 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
%22464 = "torch.aten.clamp"(%22461, %22462, %22463) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
"torch.bind_symbolic_shape"(%22464, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
%22465 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
%22466 = "torch.prims.convert_element_type"(%22464, %22465) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%22466, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
%22467 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%22468 = "torch.aten.unsqueeze"(%17419, %22467) : (!torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>
%22469 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%22470 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
%22471 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
%22472 = "torch.prim.ListConstruct"(%22469, %22470, %22471) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%22473 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
%22474 = "torch.aten.expand"(%22468, %22472, %22473) : (!torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ>
%22475 = "torch_c.to_builtin_tensor"(%22466) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
%22476 = "torch_c.to_builtin_tensor"(%22474) : (!torch.vtensor<[4,14336,4096],f8E4M3FNUZ>) -> tensor<4x14336x4096xf8E4M3FNUZ>
%22477 = "util.call"(%22475, %22476) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x?x14336xf32>
%22478 = "torch_c.from_builtin_tensor"(%22477) : (tensor<4x?x14336xf32>) -> !torch.vtensor<[4,?,14336],f32>
"torch.bind_symbolic_shape"(%22478, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> ()
%22479 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
%22480 = "torch.prims.convert_element_type"(%22478, %22479) : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> !torch.vtensor<[4,?,14336],bf16>
"torch.bind_symbolic_shape"(%22480, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
%22481 = "torch.aten.silu"(%22480) : (!torch.vtensor<[4,?,14336],bf16>) -> !torch.vtensor<[4,?,14336],bf16>
"torch.bind_symbolic_shape"(%22481, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
%22482 = "torch.aten.div.Tensor"(%22460, %17421) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
"torch.bind_symbolic_shape"(%22482, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
%22483 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
%22484 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
%22485 = "torch.aten.clamp"(%22482, %22483, %22484) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
"torch.bind_symbolic_shape"(%22485, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
%22486 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
%22487 = "torch.prims.convert_element_type"(%22485, %22486) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%22487, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
%22488 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%22489 = "torch.aten.unsqueeze"(%17423, %22488) : (!torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>
%22490 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%22491 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
%22492 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
%22493 = "torch.prim.ListConstruct"(%22490, %22491, %22492) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%22494 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
%22495 = "torch.aten.expand"(%22489, %22493, %22494) : (!torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ>
%22496 = "torch_c.to_builtin_tensor"(%22487) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
%22497 = "torch_c.to_builtin_tensor"(%22495) : (!torch.vtensor<[4,14336,4096],f8E4M3FNUZ>) -> tensor<4x14336x4096xf8E4M3FNUZ>
%22498 = "util.call"(%22496, %22497) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x?x14336xf32>
%22499 = "torch_c.from_builtin_tensor"(%22498) : (tensor<4x?x14336xf32>) -> !torch.vtensor<[4,?,14336],f32>
"torch.bind_symbolic_shape"(%22499, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> ()
%22500 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
%22501 = "torch.prims.convert_element_type"(%22499, %22500) : (!torch.vtensor<[4,?,14336],f32>, !torch.int) -> !torch.vtensor<[4,?,14336],bf16>
"torch.bind_symbolic_shape"(%22501, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
%22502 = "torch.aten.mul.Tensor"(%22481, %22501) : (!torch.vtensor<[4,?,14336],bf16>, !torch.vtensor<[4,?,14336],bf16>) -> !torch.vtensor<[4,?,14336],bf16>
"torch.bind_symbolic_shape"(%22502, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
%22503 = "torch.aten.div.Tensor"(%22502, %17425) : (!torch.vtensor<[4,?,14336],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,14336],bf16>
"torch.bind_symbolic_shape"(%22503, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
%22504 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
%22505 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
%22506 = "torch.aten.clamp"(%22503, %22504, %22505) : (!torch.vtensor<[4,?,14336],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,14336],bf16>
"torch.bind_symbolic_shape"(%22506, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> ()
%22507 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
%22508 = "torch.prims.convert_element_type"(%22506, %22507) : (!torch.vtensor<[4,?,14336],bf16>, !torch.int) -> !torch.vtensor<[4,?,14336],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%22508, %18474) <{shape_expressions = #map28}> : (!torch.vtensor<[4,?,14336],f8E4M3FNUZ>, !torch.int) -> ()
%22509 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%22510 = "torch.aten.unsqueeze"(%17427, %22509) : (!torch.vtensor<[4096,14336],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,14336],f8E4M3FNUZ>
%22511 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%22512 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
%22513 = "torch.constant.int"() <{value = 14336 : i64}> : () -> !torch.int
%22514 = "torch.prim.ListConstruct"(%22511, %22512, %22513) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%22515 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
%22516 = "torch.aten.expand"(%22510, %22514, %22515) : (!torch.vtensor<[1,4096,14336],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,14336],f8E4M3FNUZ>
%22517 = "torch_c.to_builtin_tensor"(%22508) : (!torch.vtensor<[4,?,14336],f8E4M3FNUZ>) -> tensor<4x?x14336xf8E4M3FNUZ>
%22518 = "torch_c.to_builtin_tensor"(%22516) : (!torch.vtensor<[4,4096,14336],f8E4M3FNUZ>) -> tensor<4x4096x14336xf8E4M3FNUZ>
%22519 = "util.call"(%22517, %22518) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx14336xf8E4M3FNUZ_R4x4096x14336xf8E4M3FNUZ}> : (tensor<4x?x14336xf8E4M3FNUZ>, tensor<4x4096x14336xf8E4M3FNUZ>) -> tensor<4x?x4096xf32>
%22520 = "torch_c.from_builtin_tensor"(%22519) : (tensor<4x?x4096xf32>) -> !torch.vtensor<[4,?,4096],f32>
"torch.bind_symbolic_shape"(%22520, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
%22521 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
%22522 = "torch.prims.convert_element_type"(%22520, %22521) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
"torch.bind_symbolic_shape"(%22522, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
%22523 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%22524 = "torch.aten.add.Tensor"(%22441, %22522, %22523) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
"torch.bind_symbolic_shape"(%22524, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
%22525 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
%22526 = "torch.prims.convert_element_type"(%22524, %22525) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
"torch.bind_symbolic_shape"(%22526, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
%22527 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
%22528 = "torch.aten.pow.Tensor_Scalar"(%22526, %22527) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f32>
"torch.bind_symbolic_shape"(%22528, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
%22529 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
%22530 = "torch.prim.ListConstruct"(%22529) : (!torch.int) -> !torch.list<int>
%22531 = "torch.constant.bool"() <{value = true}> : () -> !torch.bool
%22532 = "torch.constant.none"() : () -> !torch.none
%22533 = "torch.aten.mean.dim"(%22528, %22530, %22531, %22532) : (!torch.vtensor<[4,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none) -> !torch.vtensor<[4,?,1],f32>
"torch.bind_symbolic_shape"(%22533, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
%22534 = "torch.constant.float"() <{value = 1.000000e-05 : f64}> : () -> !torch.float
%22535 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%22536 = "torch.aten.add.Scalar"(%22533, %22534, %22535) : (!torch.vtensor<[4,?,1],f32>, !torch.float, !torch.int) -> !torch.vtensor<[4,?,1],f32>
"torch.bind_symbolic_shape"(%22536, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
%22537 = "torch.aten.rsqrt"(%22536) : (!torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,1],f32>
"torch.bind_symbolic_shape"(%22537, %18474) <{shape_expressions = #map10}> : (!torch.vtensor<[4,?,1],f32>, !torch.int) -> ()
%22538 = "torch.aten.mul.Tensor"(%22526, %22537) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[4,?,1],f32>) -> !torch.vtensor<[4,?,4096],f32>
"torch.bind_symbolic_shape"(%22538, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
%22539 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
%22540 = "torch.prims.convert_element_type"(%22538, %22539) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
"torch.bind_symbolic_shape"(%22540, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
%22541 = "torch.aten.mul.Tensor"(%17429, %22540) : (!torch.vtensor<[4096],bf16>, !torch.vtensor<[4,?,4096],bf16>) -> !torch.vtensor<[4,?,4096],bf16>
"torch.bind_symbolic_shape"(%22541, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
%22542 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
%22543 = "torch.prims.convert_element_type"(%22541, %22542) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],bf16>
"torch.bind_symbolic_shape"(%22543, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
%22544 = "torch.aten.div.Tensor"(%22543, %17431) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
"torch.bind_symbolic_shape"(%22544, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
%22545 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
%22546 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
%22547 = "torch.aten.clamp"(%22544, %22545, %22546) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
"torch.bind_symbolic_shape"(%22547, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
%22548 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
%22549 = "torch.prims.convert_element_type"(%22547, %22548) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%22549, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
%22550 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%22551 = "torch.aten.unsqueeze"(%17433, %22550) : (!torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>
%22552 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%22553 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
%22554 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
%22555 = "torch.prim.ListConstruct"(%22552, %22553, %22554) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%22556 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
%22557 = "torch.aten.expand"(%22551, %22555, %22556) : (!torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ>
%22558 = "torch_c.to_builtin_tensor"(%22549) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
%22559 = "torch_c.to_builtin_tensor"(%22557) : (!torch.vtensor<[4,4096,4096],f8E4M3FNUZ>) -> tensor<4x4096x4096xf8E4M3FNUZ>
%22560 = "util.call"(%22558, %22559) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x?x4096xf32>
%22561 = "torch_c.from_builtin_tensor"(%22560) : (tensor<4x?x4096xf32>) -> !torch.vtensor<[4,?,4096],f32>
"torch.bind_symbolic_shape"(%22561, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
%22562 = "torch.aten.div.Tensor"(%22561, %17435) : (!torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],f32>
"torch.bind_symbolic_shape"(%22562, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
%22563 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
%22564 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
%22565 = "torch.aten.clamp"(%22562, %22563, %22564) : (!torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],f32>
"torch.bind_symbolic_shape"(%22565, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> ()
%22566 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
%22567 = "torch.prims.convert_element_type"(%22565, %22566) : (!torch.vtensor<[4,?,4096],f32>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%22567, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
%22568 = "torch.aten.div.Tensor"(%22543, %17437) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
"torch.bind_symbolic_shape"(%22568, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
%22569 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
%22570 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
%22571 = "torch.aten.clamp"(%22568, %22569, %22570) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
"torch.bind_symbolic_shape"(%22571, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
%22572 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
%22573 = "torch.prims.convert_element_type"(%22571, %22572) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%22573, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
%22574 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%22575 = "torch.aten.unsqueeze"(%17439, %22574) : (!torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>
%22576 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%22577 = "torch.constant.int"() <{value = 1024 : i64}> : () -> !torch.int
%22578 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
%22579 = "torch.prim.ListConstruct"(%22576, %22577, %22578) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%22580 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
%22581 = "torch.aten.expand"(%22575, %22579, %22580) : (!torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ>
%22582 = "torch_c.to_builtin_tensor"(%22573) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
%22583 = "torch_c.to_builtin_tensor"(%22581) : (!torch.vtensor<[4,1024,4096],f8E4M3FNUZ>) -> tensor<4x1024x4096xf8E4M3FNUZ>
%22584 = "util.call"(%22582, %22583) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x?x1024xf32>
%22585 = "torch_c.from_builtin_tensor"(%22584) : (tensor<4x?x1024xf32>) -> !torch.vtensor<[4,?,1024],f32>
"torch.bind_symbolic_shape"(%22585, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
%22586 = "torch.aten.div.Tensor"(%22585, %17441) : (!torch.vtensor<[4,?,1024],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,1024],f32>
"torch.bind_symbolic_shape"(%22586, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
%22587 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
%22588 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
%22589 = "torch.aten.clamp"(%22586, %22587, %22588) : (!torch.vtensor<[4,?,1024],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,1024],f32>
"torch.bind_symbolic_shape"(%22589, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
%22590 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
%22591 = "torch.prims.convert_element_type"(%22589, %22590) : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> !torch.vtensor<[4,?,1024],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%22591, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.int) -> ()
%22592 = "torch.aten.div.Tensor"(%22543, %17443) : (!torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,4096],bf16>
"torch.bind_symbolic_shape"(%22592, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
%22593 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
%22594 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
%22595 = "torch.aten.clamp"(%22592, %22593, %22594) : (!torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,4096],bf16>
"torch.bind_symbolic_shape"(%22595, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> ()
%22596 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
%22597 = "torch.prims.convert_element_type"(%22595, %22596) : (!torch.vtensor<[4,?,4096],bf16>, !torch.int) -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%22597, %18474) <{shape_expressions = #map9}> : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.int) -> ()
%22598 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%22599 = "torch.aten.unsqueeze"(%17445, %22598) : (!torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>
%22600 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%22601 = "torch.constant.int"() <{value = 1024 : i64}> : () -> !torch.int
%22602 = "torch.constant.int"() <{value = 4096 : i64}> : () -> !torch.int
%22603 = "torch.prim.ListConstruct"(%22600, %22601, %22602) : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%22604 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
%22605 = "torch.aten.expand"(%22599, %22603, %22604) : (!torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ>
%22606 = "torch_c.to_builtin_tensor"(%22597) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>) -> tensor<4x?x4096xf8E4M3FNUZ>
%22607 = "torch_c.to_builtin_tensor"(%22605) : (!torch.vtensor<[4,1024,4096],f8E4M3FNUZ>) -> tensor<4x1024x4096xf8E4M3FNUZ>
%22608 = "util.call"(%22606, %22607) <{callee = @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ}> : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x?x1024xf32>
%22609 = "torch_c.from_builtin_tensor"(%22608) : (tensor<4x?x1024xf32>) -> !torch.vtensor<[4,?,1024],f32>
"torch.bind_symbolic_shape"(%22609, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
%22610 = "torch.aten.div.Tensor"(%22609, %17447) : (!torch.vtensor<[4,?,1024],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[4,?,1024],f32>
"torch.bind_symbolic_shape"(%22610, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
%22611 = "torch.constant.float"() <{value = -2.400000e+02 : f64}> : () -> !torch.float
%22612 = "torch.constant.float"() <{value = 2.400000e+02 : f64}> : () -> !torch.float
%22613 = "torch.aten.clamp"(%22610, %22611, %22612) : (!torch.vtensor<[4,?,1024],f32>, !torch.float, !torch.float) -> !torch.vtensor<[4,?,1024],f32>
"torch.bind_symbolic_shape"(%22613, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> ()
%22614 = "torch.constant.int"() <{value = 26 : i64}> : () -> !torch.int
%22615 = "torch.prims.convert_element_type"(%22613, %22614) : (!torch.vtensor<[4,?,1024],f32>, !torch.int) -> !torch.vtensor<[4,?,1024],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%22615, %18474) <{shape_expressions = #map11}> : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.int) -> ()
%22616 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%22617 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
%22618 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
%22619 = "torch.prim.ListConstruct"(%22616, %18481, %22617, %22618) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%22620 = "torch.aten.view"(%22567, %22619) : (!torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%22620, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
%22621 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%22622 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
%22623 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
%22624 = "torch.prim.ListConstruct"(%22621, %18481, %22622, %22623) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%22625 = "torch.aten.view"(%22591, %22624) : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%22625, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
%22626 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%22627 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
%22628 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
%22629 = "torch.prim.ListConstruct"(%22626, %18481, %22627, %22628) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%22630 = "torch.aten.view"(%22615, %22629) : (!torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%22630, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
%22631 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
%22632 = "torch.constant.none"() : () -> !torch.none
%22633 = "torch.constant.none"() : () -> !torch.none
%22634 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
%22635 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
%22636 = "torch.aten.arange"(%22631, %22632, %22633, %22634, %22635) : (!torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[131072],si64>
%22637 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%22638 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
%22639 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
%22640 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%22641 = "torch.constant.none"() : () -> !torch.none
%22642 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
%22643 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
%22644 = "torch.aten.arange.start_step"(%22637, %22638, %22639, %22640, %22641, %22642, %22643) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[64],si64>
%22645 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
%22646 = "torch.prims.convert_element_type"(%22644, %22645) : (!torch.vtensor<[64],si64>, !torch.int) -> !torch.vtensor<[64],f32>
%22647 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
%22648 = "torch.aten.div.Scalar"(%22646, %22647) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
%22649 = "torch.constant.float"() <{value = 5.000000e+05 : f64}> : () -> !torch.float
%22650 = "torch.aten.pow.Scalar"(%22649, %22648) : (!torch.float, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
%22651 = "torch.aten.reciprocal"(%22650) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
%22652 = "torch.constant.float"() <{value = 1.000000e+00 : f64}> : () -> !torch.float
%22653 = "torch.aten.mul.Scalar"(%22651, %22652) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
%22654 = "torch.aten.reciprocal"(%22653) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
%22655 = "torch.constant.float"() <{value = 6.2831853071795862 : f64}> : () -> !torch.float
%22656 = "torch.aten.mul.Scalar"(%22654, %22655) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
%22657 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
%22658 = "torch.aten.gt.Scalar"(%22656, %22657) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
%22659 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
%22660 = "torch.aten.div.Scalar"(%22653, %22659) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
%22661 = "torch.aten.where.self"(%22658, %22660, %22653) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
%22662 = "torch.aten.reciprocal"(%22656) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
%22663 = "torch.constant.int"() <{value = 8192 : i64}> : () -> !torch.int
%22664 = "torch.aten.mul.Scalar"(%22662, %22663) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
%22665 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%22666 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%22667 = "torch.aten.sub.Scalar"(%22664, %22665, %22666) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
%22668 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
%22669 = "torch.aten.div.Scalar"(%22667, %22668) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
%22670 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%22671 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%22672 = "torch.aten.rsub.Scalar"(%22669, %22670, %22671) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
%22673 = "torch.aten.mul.Tensor"(%22672, %22661) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
%22674 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
%22675 = "torch.aten.div.Scalar"(%22673, %22674) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
%22676 = "torch.aten.mul.Tensor"(%22669, %22661) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
%22677 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%22678 = "torch.aten.add.Tensor"(%22675, %22676, %22677) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
%22679 = "torch.constant.float"() <{value = 2.048000e+03 : f64}> : () -> !torch.float
%22680 = "torch.aten.lt.Scalar"(%22656, %22679) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
%22681 = "torch.aten.bitwise_not"(%22680) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
%22682 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
%22683 = "torch.aten.gt.Scalar"(%22656, %22682) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
%22684 = "torch.aten.bitwise_not"(%22683) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
%22685 = "torch.aten.mul.Tensor"(%22681, %22684) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
%22686 = "torch.aten.where.self"(%22685, %22678, %22661) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
%22687 = "torch.prim.ListConstruct"(%22686, %22686) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.list<vtensor>
%22688 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
%22689 = "torch.aten.cat"(%22687, %22688) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[128],f32>
%22690 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
%22691 = "torch.prims.convert_element_type"(%22636, %22690) : (!torch.vtensor<[131072],si64>, !torch.int) -> !torch.vtensor<[131072],f32>
%22692 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
%22693 = "torch.prims.convert_element_type"(%22689, %22692) : (!torch.vtensor<[128],f32>, !torch.int) -> !torch.vtensor<[128],f32>
%22694 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
%22695 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%22696 = "torch.prim.ListConstruct"(%22694, %22695) : (!torch.int, !torch.int) -> !torch.list<int>
%22697 = "torch.aten.view"(%22691, %22696) : (!torch.vtensor<[131072],f32>, !torch.list<int>) -> !torch.vtensor<[131072,1],f32>
%22698 = "torch.aten.mul.Tensor"(%22697, %22693) : (!torch.vtensor<[131072,1],f32>, !torch.vtensor<[128],f32>) -> !torch.vtensor<[131072,128],f32>
%22699 = "torch.aten.cos"(%22698) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
%22700 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
%22701 = "torch.prims.convert_element_type"(%22699, %22700) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
%22702 = "torch.aten.sin"(%22698) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
%22703 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
%22704 = "torch.prims.convert_element_type"(%22702, %22703) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
%22705 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%22706 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%22707 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%22708 = "torch.aten.slice.Tensor"(%22701, %22705, %22706, %18481, %22707) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
"torch.bind_symbolic_shape"(%22708, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
%22709 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%22710 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%22711 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
%22712 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%22713 = "torch.aten.slice.Tensor"(%22708, %22709, %22710, %22711, %22712) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
"torch.bind_symbolic_shape"(%22713, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
%22714 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%22715 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%22716 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%22717 = "torch.aten.slice.Tensor"(%22704, %22714, %22715, %18481, %22716) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
"torch.bind_symbolic_shape"(%22717, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
%22718 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%22719 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%22720 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
%22721 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%22722 = "torch.aten.slice.Tensor"(%22717, %22718, %22719, %22720, %22721) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
"torch.bind_symbolic_shape"(%22722, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
%22723 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%22724 = "torch.aten.unsqueeze"(%22713, %22723) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
"torch.bind_symbolic_shape"(%22724, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
%22725 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%22726 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%22727 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
%22728 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%22729 = "torch.aten.slice.Tensor"(%22724, %22725, %22726, %22727, %22728) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
"torch.bind_symbolic_shape"(%22729, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
%22730 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
%22731 = "torch.aten.unsqueeze"(%22729, %22730) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
"torch.bind_symbolic_shape"(%22731, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
%22732 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
%22733 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%22734 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
%22735 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%22736 = "torch.aten.slice.Tensor"(%22731, %22732, %22733, %22734, %22735) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
"torch.bind_symbolic_shape"(%22736, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
%22737 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%22738 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%22739 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%22740 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%22741 = "torch.prim.ListConstruct"(%22737, %22738, %22739, %22740) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%22742 = "torch.aten.repeat"(%22736, %22741) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
"torch.bind_symbolic_shape"(%22742, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
%22743 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%22744 = "torch.aten.unsqueeze"(%22722, %22743) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
"torch.bind_symbolic_shape"(%22744, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
%22745 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%22746 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%22747 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
%22748 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%22749 = "torch.aten.slice.Tensor"(%22744, %22745, %22746, %22747, %22748) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
"torch.bind_symbolic_shape"(%22749, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
%22750 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
%22751 = "torch.aten.unsqueeze"(%22749, %22750) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
"torch.bind_symbolic_shape"(%22751, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
%22752 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
%22753 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%22754 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
%22755 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%22756 = "torch.aten.slice.Tensor"(%22751, %22752, %22753, %22754, %22755) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
"torch.bind_symbolic_shape"(%22756, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
%22757 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%22758 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%22759 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%22760 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%22761 = "torch.prim.ListConstruct"(%22757, %22758, %22759, %22760) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%22762 = "torch.aten.repeat"(%22756, %22761) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
"torch.bind_symbolic_shape"(%22762, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
%22763 = "torch.aten.mul.Tensor"(%22620, %22742) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%22763, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
%22764 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
%22765 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%22766 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
%22767 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%22768 = "torch.aten.slice.Tensor"(%22620, %22764, %22765, %22766, %22767) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%22768, %18474) <{shape_expressions = #map18}> : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.int) -> ()
%22769 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
%22770 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
%22771 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
%22772 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%22773 = "torch.aten.slice.Tensor"(%22620, %22769, %22770, %22771, %22772) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%22773, %18474) <{shape_expressions = #map18}> : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.int) -> ()
%22774 = "torch.aten.neg"(%22773) : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>) -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%22774, %18474) <{shape_expressions = #map18}> : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.int) -> ()
%22775 = "torch.prim.ListConstruct"(%22774, %22768) : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>) -> !torch.list<vtensor>
%22776 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
%22777 = "torch.aten.cat"(%22775, %22776) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%22777, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
%22778 = "torch.aten.mul.Tensor"(%22777, %22762) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%22778, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
%22779 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%22780 = "torch.aten.add.Tensor"(%22763, %22778, %22779) : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%22780, %18474) <{shape_expressions = #map12}> : (!torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int) -> ()
%22781 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
%22782 = "torch.constant.none"() : () -> !torch.none
%22783 = "torch.constant.none"() : () -> !torch.none
%22784 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
%22785 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
%22786 = "torch.aten.arange"(%22781, %22782, %22783, %22784, %22785) : (!torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[131072],si64>
%22787 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%22788 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
%22789 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
%22790 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%22791 = "torch.constant.none"() : () -> !torch.none
%22792 = "torch.constant.device"() <{value = "cpu"}> : () -> !torch.Device
%22793 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
%22794 = "torch.aten.arange.start_step"(%22787, %22788, %22789, %22790, %22791, %22792, %22793) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.none, !torch.Device, !torch.bool) -> !torch.vtensor<[64],si64>
%22795 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
%22796 = "torch.prims.convert_element_type"(%22794, %22795) : (!torch.vtensor<[64],si64>, !torch.int) -> !torch.vtensor<[64],f32>
%22797 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
%22798 = "torch.aten.div.Scalar"(%22796, %22797) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
%22799 = "torch.constant.float"() <{value = 5.000000e+05 : f64}> : () -> !torch.float
%22800 = "torch.aten.pow.Scalar"(%22799, %22798) : (!torch.float, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
%22801 = "torch.aten.reciprocal"(%22800) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
%22802 = "torch.constant.float"() <{value = 1.000000e+00 : f64}> : () -> !torch.float
%22803 = "torch.aten.mul.Scalar"(%22801, %22802) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
%22804 = "torch.aten.reciprocal"(%22803) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
%22805 = "torch.constant.float"() <{value = 6.2831853071795862 : f64}> : () -> !torch.float
%22806 = "torch.aten.mul.Scalar"(%22804, %22805) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],f32>
%22807 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
%22808 = "torch.aten.gt.Scalar"(%22806, %22807) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
%22809 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
%22810 = "torch.aten.div.Scalar"(%22803, %22809) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
%22811 = "torch.aten.where.self"(%22808, %22810, %22803) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
%22812 = "torch.aten.reciprocal"(%22806) : (!torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
%22813 = "torch.constant.int"() <{value = 8192 : i64}> : () -> !torch.int
%22814 = "torch.aten.mul.Scalar"(%22812, %22813) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
%22815 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%22816 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%22817 = "torch.aten.sub.Scalar"(%22814, %22815, %22816) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
%22818 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
%22819 = "torch.aten.div.Scalar"(%22817, %22818) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
%22820 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%22821 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%22822 = "torch.aten.rsub.Scalar"(%22819, %22820, %22821) : (!torch.vtensor<[64],f32>, !torch.int, !torch.int) -> !torch.vtensor<[64],f32>
%22823 = "torch.aten.mul.Tensor"(%22822, %22811) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
%22824 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
%22825 = "torch.aten.div.Scalar"(%22823, %22824) : (!torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
%22826 = "torch.aten.mul.Tensor"(%22819, %22811) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
%22827 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%22828 = "torch.aten.add.Tensor"(%22825, %22826, %22827) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>, !torch.int) -> !torch.vtensor<[64],f32>
%22829 = "torch.constant.float"() <{value = 2.048000e+03 : f64}> : () -> !torch.float
%22830 = "torch.aten.lt.Scalar"(%22806, %22829) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
%22831 = "torch.aten.bitwise_not"(%22830) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
%22832 = "torch.constant.float"() <{value = 8.192000e+03 : f64}> : () -> !torch.float
%22833 = "torch.aten.gt.Scalar"(%22806, %22832) : (!torch.vtensor<[64],f32>, !torch.float) -> !torch.vtensor<[64],i1>
%22834 = "torch.aten.bitwise_not"(%22833) : (!torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
%22835 = "torch.aten.mul.Tensor"(%22831, %22834) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],i1>) -> !torch.vtensor<[64],i1>
%22836 = "torch.aten.where.self"(%22835, %22828, %22811) : (!torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.vtensor<[64],f32>
%22837 = "torch.prim.ListConstruct"(%22836, %22836) : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.list<vtensor>
%22838 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
%22839 = "torch.aten.cat"(%22837, %22838) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[128],f32>
%22840 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
%22841 = "torch.prims.convert_element_type"(%22786, %22840) : (!torch.vtensor<[131072],si64>, !torch.int) -> !torch.vtensor<[131072],f32>
%22842 = "torch.constant.int"() <{value = 6 : i64}> : () -> !torch.int
%22843 = "torch.prims.convert_element_type"(%22839, %22842) : (!torch.vtensor<[128],f32>, !torch.int) -> !torch.vtensor<[128],f32>
%22844 = "torch.constant.int"() <{value = 131072 : i64}> : () -> !torch.int
%22845 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%22846 = "torch.prim.ListConstruct"(%22844, %22845) : (!torch.int, !torch.int) -> !torch.list<int>
%22847 = "torch.aten.view"(%22841, %22846) : (!torch.vtensor<[131072],f32>, !torch.list<int>) -> !torch.vtensor<[131072,1],f32>
%22848 = "torch.aten.mul.Tensor"(%22847, %22843) : (!torch.vtensor<[131072,1],f32>, !torch.vtensor<[128],f32>) -> !torch.vtensor<[131072,128],f32>
%22849 = "torch.aten.cos"(%22848) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
%22850 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
%22851 = "torch.prims.convert_element_type"(%22849, %22850) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
%22852 = "torch.aten.sin"(%22848) : (!torch.vtensor<[131072,128],f32>) -> !torch.vtensor<[131072,128],f32>
%22853 = "torch.constant.int"() <{value = 15 : i64}> : () -> !torch.int
%22854 = "torch.prims.convert_element_type"(%22852, %22853) : (!torch.vtensor<[131072,128],f32>, !torch.int) -> !torch.vtensor<[131072,128],bf16>
%22855 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%22856 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%22857 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%22858 = "torch.aten.slice.Tensor"(%22851, %22855, %22856, %18481, %22857) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
"torch.bind_symbolic_shape"(%22858, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
%22859 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%22860 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%22861 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
%22862 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%22863 = "torch.aten.slice.Tensor"(%22858, %22859, %22860, %22861, %22862) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
"torch.bind_symbolic_shape"(%22863, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
%22864 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%22865 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%22866 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%22867 = "torch.aten.slice.Tensor"(%22854, %22864, %22865, %18481, %22866) : (!torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
"torch.bind_symbolic_shape"(%22867, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
%22868 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%22869 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%22870 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
%22871 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%22872 = "torch.aten.slice.Tensor"(%22867, %22868, %22869, %22870, %22871) : (!torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[?,128],bf16>
"torch.bind_symbolic_shape"(%22872, %18474) <{shape_expressions = #map14}> : (!torch.vtensor<[?,128],bf16>, !torch.int) -> ()
%22873 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%22874 = "torch.aten.unsqueeze"(%22863, %22873) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
"torch.bind_symbolic_shape"(%22874, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
%22875 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%22876 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%22877 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
%22878 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%22879 = "torch.aten.slice.Tensor"(%22874, %22875, %22876, %22877, %22878) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
"torch.bind_symbolic_shape"(%22879, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
%22880 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
%22881 = "torch.aten.unsqueeze"(%22879, %22880) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
"torch.bind_symbolic_shape"(%22881, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
%22882 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
%22883 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%22884 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
%22885 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%22886 = "torch.aten.slice.Tensor"(%22881, %22882, %22883, %22884, %22885) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
"torch.bind_symbolic_shape"(%22886, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
%22887 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%22888 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%22889 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%22890 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%22891 = "torch.prim.ListConstruct"(%22887, %22888, %22889, %22890) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%22892 = "torch.aten.repeat"(%22886, %22891) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
"torch.bind_symbolic_shape"(%22892, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
%22893 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%22894 = "torch.aten.unsqueeze"(%22872, %22893) : (!torch.vtensor<[?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
"torch.bind_symbolic_shape"(%22894, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
%22895 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%22896 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%22897 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
%22898 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%22899 = "torch.aten.slice.Tensor"(%22894, %22895, %22896, %22897, %22898) : (!torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,128],bf16>
"torch.bind_symbolic_shape"(%22899, %18474) <{shape_expressions = #map15}> : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> ()
%22900 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
%22901 = "torch.aten.unsqueeze"(%22899, %22900) : (!torch.vtensor<[1,?,128],bf16>, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
"torch.bind_symbolic_shape"(%22901, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
%22902 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
%22903 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%22904 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
%22905 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%22906 = "torch.aten.slice.Tensor"(%22901, %22902, %22903, %22904, %22905) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[1,?,1,128],bf16>
"torch.bind_symbolic_shape"(%22906, %18474) <{shape_expressions = #map16}> : (!torch.vtensor<[1,?,1,128],bf16>, !torch.int) -> ()
%22907 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%22908 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%22909 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%22910 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%22911 = "torch.prim.ListConstruct"(%22907, %22908, %22909, %22910) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%22912 = "torch.aten.repeat"(%22906, %22911) : (!torch.vtensor<[1,?,1,128],bf16>, !torch.list<int>) -> !torch.vtensor<[4,?,1,128],bf16>
"torch.bind_symbolic_shape"(%22912, %18474) <{shape_expressions = #map17}> : (!torch.vtensor<[4,?,1,128],bf16>, !torch.int) -> ()
%22913 = "torch.aten.mul.Tensor"(%22625, %22892) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%22913, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
%22914 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
%22915 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%22916 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
%22917 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%22918 = "torch.aten.slice.Tensor"(%22625, %22914, %22915, %22916, %22917) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%22918, %18474) <{shape_expressions = #map19}> : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.int) -> ()
%22919 = "torch.constant.int"() <{value = 3 : i64}> : () -> !torch.int
%22920 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
%22921 = "torch.constant.int"() <{value = 9223372036854775807 : i64}> : () -> !torch.int
%22922 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%22923 = "torch.aten.slice.Tensor"(%22625, %22919, %22920, %22921, %22922) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%22923, %18474) <{shape_expressions = #map19}> : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.int) -> ()
%22924 = "torch.aten.neg"(%22923) : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>) -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%22924, %18474) <{shape_expressions = #map19}> : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.int) -> ()
%22925 = "torch.prim.ListConstruct"(%22924, %22918) : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>) -> !torch.list<vtensor>
%22926 = "torch.constant.int"() <{value = -1 : i64}> : () -> !torch.int
%22927 = "torch.aten.cat"(%22925, %22926) : (!torch.list<vtensor>, !torch.int) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%22927, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
%22928 = "torch.aten.mul.Tensor"(%22927, %22912) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16>) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%22928, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
%22929 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%22930 = "torch.aten.add.Tensor"(%22913, %22928, %22929) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%22930, %18474) <{shape_expressions = #map13}> : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> ()
%22931 = "torch.constant.int"() <{value = 64 : i64}> : () -> !torch.int
%22932 = "torch.aten.mul.Scalar"(%arg69, %22931) : (!torch.vtensor<[4,?],si64>, !torch.int) -> !torch.vtensor<[4,?],si64>
"torch.bind_symbolic_shape"(%22932, %18474) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
%22933 = "torch.constant.int"() <{value = 12 : i64}> : () -> !torch.int
%22934 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%22935 = "torch.aten.add.Scalar"(%22932, %22933, %22934) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
"torch.bind_symbolic_shape"(%22935, %18474) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
%22936 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%22937 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
%22938 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
%22939 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
%22940 = "torch.prim.ListConstruct"(%22936, %18477, %22937, %22938, %22939) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%22941 = "torch.aten.view"(%22930, %22940) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%22941, %18474) <{shape_expressions = #map22}> : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
%22942 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
%22943 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
%22944 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
%22945 = "torch.prim.ListConstruct"(%19011, %22942, %22943, %22944) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%22946 = "torch.aten.view"(%22941, %22945) : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%22946, %18474) <{shape_expressions = #map23}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
%22947 = "torch.prim.ListConstruct"(%19011) : (!torch.int) -> !torch.list<int>
%22948 = "torch.aten.view"(%22935, %22947) : (!torch.vtensor<[4,?],si64>, !torch.list<int>) -> !torch.vtensor<[?],si64>
"torch.bind_symbolic_shape"(%22948, %18474) <{shape_expressions = #map24}> : (!torch.vtensor<[?],si64>, !torch.int) -> ()
%22949 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
%22950 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
%22951 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
%22952 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
%22953 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
%22954 = "torch.prim.ListConstruct"(%18479, %22949, %22950, %22951, %22952, %22953) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%22955 = "torch.aten.view"(%22357, %22954) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%22955, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
%22956 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
%22957 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
%22958 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
%22959 = "torch.prim.ListConstruct"(%18993, %22956, %22957, %22958) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%22960 = "torch.aten.view"(%22955, %22959) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%22960, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
%22961 = "torch.prim.ListConstruct"(%22948) : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>>
%22962 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
%22963 = "torch.aten.index_put"(%22960, %22961, %22946, %22962) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.bool) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%22963, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
%22964 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
%22965 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
%22966 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
%22967 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
%22968 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
%22969 = "torch.prim.ListConstruct"(%18479, %22964, %22965, %22966, %22967, %22968) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%22970 = "torch.aten.view"(%22963, %22969) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%22970, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
%22971 = "torch.constant.int"() <{value = 2097152 : i64}> : () -> !torch.int
%22972 = "torch.prim.ListConstruct"(%18479, %22971) : (!torch.int, !torch.int) -> !torch.list<int>
%22973 = "torch.aten.view"(%22970, %22972) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,2097152],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%22973, %18475) <{shape_expressions = #map2}> : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> ()
%22974 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
%22975 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
%22976 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
%22977 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
%22978 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
%22979 = "torch.prim.ListConstruct"(%18479, %22974, %22975, %22976, %22977, %22978) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%22980 = "torch.aten.view"(%22973, %22979) : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%22980, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
%22981 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
%22982 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
%22983 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
%22984 = "torch.prim.ListConstruct"(%18993, %22981, %22982, %22983) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%22985 = "torch.aten.view"(%22980, %22984) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%22985, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
%22986 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%22987 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
%22988 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
%22989 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
%22990 = "torch.prim.ListConstruct"(%22986, %18477, %22987, %22988, %22989) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%22991 = "torch.aten.view"(%22630, %22990) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%22991, %18474) <{shape_expressions = #map22}> : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
%22992 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
%22993 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
%22994 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
%22995 = "torch.prim.ListConstruct"(%19011, %22992, %22993, %22994) : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%22996 = "torch.aten.view"(%22991, %22995) : (!torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%22996, %18474) <{shape_expressions = #map23}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
%22997 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%22998 = "torch.constant.int"() <{value = 1 : i64}> : () -> !torch.int
%22999 = "torch.aten.add.Scalar"(%22935, %22997, %22998) : (!torch.vtensor<[4,?],si64>, !torch.int, !torch.int) -> !torch.vtensor<[4,?],si64>
"torch.bind_symbolic_shape"(%22999, %18474) <{shape_expressions = #map1}> : (!torch.vtensor<[4,?],si64>, !torch.int) -> ()
%23000 = "torch.prim.ListConstruct"(%19011) : (!torch.int) -> !torch.list<int>
%23001 = "torch.aten.view"(%22999, %23000) : (!torch.vtensor<[4,?],si64>, !torch.list<int>) -> !torch.vtensor<[?],si64>
"torch.bind_symbolic_shape"(%23001, %18474) <{shape_expressions = #map24}> : (!torch.vtensor<[?],si64>, !torch.int) -> ()
%23002 = "torch.prim.ListConstruct"(%23001) : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>>
%23003 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
%23004 = "torch.aten.index_put"(%22985, %23002, %22996, %23003) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.bool) -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%23004, %18475) <{shape_expressions = #map21}> : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
%23005 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
%23006 = "torch.constant.int"() <{value = 2 : i64}> : () -> !torch.int
%23007 = "torch.constant.int"() <{value = 32 : i64}> : () -> !torch.int
%23008 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
%23009 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
%23010 = "torch.prim.ListConstruct"(%18479, %23005, %23006, %23007, %23008, %23009) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%23011 = "torch.aten.view"(%23004, %23010) : (!torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%23011, %18475) <{shape_expressions = #map20}> : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int) -> ()
%23012 = "torch.constant.int"() <{value = 2097152 : i64}> : () -> !torch.int
%23013 = "torch.prim.ListConstruct"(%18479, %23012) : (!torch.int, !torch.int) -> !torch.list<int>
%23014 = "torch.aten.view"(%23011, %23013) : (!torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int>) -> !torch.vtensor<[?,2097152],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%23014, %18475) <{shape_expressions = #map2}> : (!torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int) -> ()
%23015 = "torch.constant.int"() <{value = -2 : i64}> : () -> !torch.int
%23016 = "torch.aten.unsqueeze"(%22930, %23015) : (!torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%23016, %18474) <{shape_expressions = #map25}> : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.int) -> ()
%23017 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%23018 = "torch.constant.int"() <{value = 8 : i64}> : () -> !torch.int
%23019 = "torch.constant.int"() <{value = 4 : i64}> : () -> !torch.int
%23020 = "torch.constant.int"() <{value = 128 : i64}> : () -> !torch.int
%23021 = "torch.prim.ListConstruct"(%23017, %18481, %23018, %23019, %23020) : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%23022 = "torch.constant.bool"() <{value = false}> : () -> !torch.bool
%23023 = "torch.aten.expand"(%23016, %23021, %23022) : (!torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%23023, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
%23024 = "torch.constant.int"() <{value = 0 : i64}> : () -> !torch.int
%23025 = "torch.aten.clone"(%23023, %23024) : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>
"torch.bind_symbolic_shape"(%23025, %18474) <{shape_expressions = #map26}> : (!torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int) -> ()
%23026 = "t