Note: this file has been truncated.
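// Indexing maps shared by the linalg.generic ops in the (truncated) body of this module.
// The reading below is inferred from the maps themselves, not stated in the dump: d0-d4 are
// the common iteration dimensions, #map3 broadcasts a rank-0 (scalar) operand such as a
// quantization scale, and #map5 indexes the (d0, d1, d2) result.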
#map = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d3)>
#map1 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d4, d3)>
#map2 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d4, d2)>
#map3 = affine_map<(d0, d1, d2, d3, d4) -> ()>
#map4 = affine_map<(d0, d1, d2, d3, d4) -> (d1, d4)>
#map5 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)>
module @module {
util.global private @__auto.token_embd.weight = #stream.parameter.named<"model"::"token_embd.weight"> : tensor<128256x4096xbf16>
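// Per-block parameter globals follow. The naming convention appears to be (inferred from the
// suffixes, not documented in this dump): ":qs" tensors hold the f8E4M3FNUZ-quantized weight
// values, while ":rscale" and ":d" are per-tensor f32 scale factors applied around the
// quantized matmuls; the *_norm weights remain in bf16.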
util.global private @__auto.blk.0.attn_norm.weight = #stream.parameter.named<"model"::"blk.0.attn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.0.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.0.attn_q.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.0.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.0.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.0.attn_q.q_output:rscale" = #stream.parameter.named<"model"::"blk.0.attn_q.q_output:rscale"> : tensor<f32>
util.global private @"__auto.blk.0.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.0.attn_k.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.0.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.0.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.0.attn_k.q_output:rscale" = #stream.parameter.named<"model"::"blk.0.attn_k.q_output:rscale"> : tensor<f32>
util.global private @"__auto.blk.0.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.0.attn_v.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.0.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.0.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.0.attn_v.q_output:rscale" = #stream.parameter.named<"model"::"blk.0.attn_v.q_output:rscale"> : tensor<f32>
util.global private @"__auto.blk.0.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.0.attn_output.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.0.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.0.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.0.attn_output.weight:d" = #stream.parameter.named<"model"::"blk.0.attn_output.weight:d"> : tensor<f32>
util.global private @__auto.blk.0.ffn_norm.weight = #stream.parameter.named<"model"::"blk.0.ffn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.0.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.0.ffn_gate.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.0.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.0.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.0.ffn_gate.weight:d" = #stream.parameter.named<"model"::"blk.0.ffn_gate.weight:d"> : tensor<f32>
util.global private @"__auto.blk.0.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.0.ffn_up.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.0.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.0.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.0.ffn_up.weight:d" = #stream.parameter.named<"model"::"blk.0.ffn_up.weight:d"> : tensor<f32>
util.global private @"__auto.blk.0.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.0.ffn_down.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.0.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.0.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>
util.global private @"__auto.blk.0.ffn_down.weight:d" = #stream.parameter.named<"model"::"blk.0.ffn_down.weight:d"> : tensor<f32>
util.global private @__auto.blk.1.attn_norm.weight = #stream.parameter.named<"model"::"blk.1.attn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.1.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.1.attn_q.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.1.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.1.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.1.attn_q.q_output:rscale" = #stream.parameter.named<"model"::"blk.1.attn_q.q_output:rscale"> : tensor<f32>
util.global private @"__auto.blk.1.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.1.attn_k.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.1.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.1.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.1.attn_k.q_output:rscale" = #stream.parameter.named<"model"::"blk.1.attn_k.q_output:rscale"> : tensor<f32>
util.global private @"__auto.blk.1.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.1.attn_v.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.1.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.1.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.1.attn_v.q_output:rscale" = #stream.parameter.named<"model"::"blk.1.attn_v.q_output:rscale"> : tensor<f32>
util.global private @"__auto.blk.1.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.1.attn_output.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.1.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.1.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.1.attn_output.weight:d" = #stream.parameter.named<"model"::"blk.1.attn_output.weight:d"> : tensor<f32>
util.global private @__auto.blk.1.ffn_norm.weight = #stream.parameter.named<"model"::"blk.1.ffn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.1.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.1.ffn_gate.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.1.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.1.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.1.ffn_gate.weight:d" = #stream.parameter.named<"model"::"blk.1.ffn_gate.weight:d"> : tensor<f32>
util.global private @"__auto.blk.1.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.1.ffn_up.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.1.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.1.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.1.ffn_up.weight:d" = #stream.parameter.named<"model"::"blk.1.ffn_up.weight:d"> : tensor<f32>
util.global private @"__auto.blk.1.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.1.ffn_down.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.1.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.1.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>
util.global private @"__auto.blk.1.ffn_down.weight:d" = #stream.parameter.named<"model"::"blk.1.ffn_down.weight:d"> : tensor<f32>
util.global private @__auto.blk.2.attn_norm.weight = #stream.parameter.named<"model"::"blk.2.attn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.2.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.2.attn_q.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.2.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.2.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.2.attn_q.q_output:rscale" = #stream.parameter.named<"model"::"blk.2.attn_q.q_output:rscale"> : tensor<f32>
util.global private @"__auto.blk.2.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.2.attn_k.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.2.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.2.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.2.attn_k.q_output:rscale" = #stream.parameter.named<"model"::"blk.2.attn_k.q_output:rscale"> : tensor<f32>
util.global private @"__auto.blk.2.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.2.attn_v.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.2.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.2.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.2.attn_v.q_output:rscale" = #stream.parameter.named<"model"::"blk.2.attn_v.q_output:rscale"> : tensor<f32>
util.global private @"__auto.blk.2.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.2.attn_output.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.2.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.2.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.2.attn_output.weight:d" = #stream.parameter.named<"model"::"blk.2.attn_output.weight:d"> : tensor<f32>
util.global private @__auto.blk.2.ffn_norm.weight = #stream.parameter.named<"model"::"blk.2.ffn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.2.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.2.ffn_gate.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.2.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.2.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.2.ffn_gate.weight:d" = #stream.parameter.named<"model"::"blk.2.ffn_gate.weight:d"> : tensor<f32>
util.global private @"__auto.blk.2.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.2.ffn_up.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.2.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.2.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.2.ffn_up.weight:d" = #stream.parameter.named<"model"::"blk.2.ffn_up.weight:d"> : tensor<f32>
util.global private @"__auto.blk.2.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.2.ffn_down.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.2.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.2.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>
util.global private @"__auto.blk.2.ffn_down.weight:d" = #stream.parameter.named<"model"::"blk.2.ffn_down.weight:d"> : tensor<f32>
util.global private @__auto.blk.3.attn_norm.weight = #stream.parameter.named<"model"::"blk.3.attn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.3.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.3.attn_q.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.3.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.3.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.3.attn_q.q_output:rscale" = #stream.parameter.named<"model"::"blk.3.attn_q.q_output:rscale"> : tensor<f32>
util.global private @"__auto.blk.3.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.3.attn_k.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.3.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.3.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.3.attn_k.q_output:rscale" = #stream.parameter.named<"model"::"blk.3.attn_k.q_output:rscale"> : tensor<f32>
util.global private @"__auto.blk.3.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.3.attn_v.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.3.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.3.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.3.attn_v.q_output:rscale" = #stream.parameter.named<"model"::"blk.3.attn_v.q_output:rscale"> : tensor<f32>
util.global private @"__auto.blk.3.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.3.attn_output.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.3.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.3.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.3.attn_output.weight:d" = #stream.parameter.named<"model"::"blk.3.attn_output.weight:d"> : tensor<f32>
util.global private @__auto.blk.3.ffn_norm.weight = #stream.parameter.named<"model"::"blk.3.ffn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.3.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.3.ffn_gate.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.3.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.3.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.3.ffn_gate.weight:d" = #stream.parameter.named<"model"::"blk.3.ffn_gate.weight:d"> : tensor<f32>
util.global private @"__auto.blk.3.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.3.ffn_up.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.3.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.3.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.3.ffn_up.weight:d" = #stream.parameter.named<"model"::"blk.3.ffn_up.weight:d"> : tensor<f32>
util.global private @"__auto.blk.3.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.3.ffn_down.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.3.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.3.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>
util.global private @"__auto.blk.3.ffn_down.weight:d" = #stream.parameter.named<"model"::"blk.3.ffn_down.weight:d"> : tensor<f32>
util.global private @__auto.blk.4.attn_norm.weight = #stream.parameter.named<"model"::"blk.4.attn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.4.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.4.attn_q.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.4.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.4.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.4.attn_q.q_output:rscale" = #stream.parameter.named<"model"::"blk.4.attn_q.q_output:rscale"> : tensor<f32>
util.global private @"__auto.blk.4.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.4.attn_k.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.4.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.4.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.4.attn_k.q_output:rscale" = #stream.parameter.named<"model"::"blk.4.attn_k.q_output:rscale"> : tensor<f32>
util.global private @"__auto.blk.4.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.4.attn_v.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.4.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.4.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.4.attn_v.q_output:rscale" = #stream.parameter.named<"model"::"blk.4.attn_v.q_output:rscale"> : tensor<f32>
util.global private @"__auto.blk.4.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.4.attn_output.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.4.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.4.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.4.attn_output.weight:d" = #stream.parameter.named<"model"::"blk.4.attn_output.weight:d"> : tensor<f32>
util.global private @__auto.blk.4.ffn_norm.weight = #stream.parameter.named<"model"::"blk.4.ffn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.4.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.4.ffn_gate.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.4.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.4.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.4.ffn_gate.weight:d" = #stream.parameter.named<"model"::"blk.4.ffn_gate.weight:d"> : tensor<f32>
util.global private @"__auto.blk.4.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.4.ffn_up.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.4.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.4.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.4.ffn_up.weight:d" = #stream.parameter.named<"model"::"blk.4.ffn_up.weight:d"> : tensor<f32>
util.global private @"__auto.blk.4.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.4.ffn_down.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.4.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.4.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>
util.global private @"__auto.blk.4.ffn_down.weight:d" = #stream.parameter.named<"model"::"blk.4.ffn_down.weight:d"> : tensor<f32>
util.global private @__auto.blk.5.attn_norm.weight = #stream.parameter.named<"model"::"blk.5.attn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.5.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.5.attn_q.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.5.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.5.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.5.attn_q.q_output:rscale" = #stream.parameter.named<"model"::"blk.5.attn_q.q_output:rscale"> : tensor<f32>
util.global private @"__auto.blk.5.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.5.attn_k.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.5.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.5.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.5.attn_k.q_output:rscale" = #stream.parameter.named<"model"::"blk.5.attn_k.q_output:rscale"> : tensor<f32>
util.global private @"__auto.blk.5.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.5.attn_v.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.5.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.5.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.5.attn_v.q_output:rscale" = #stream.parameter.named<"model"::"blk.5.attn_v.q_output:rscale"> : tensor<f32>
util.global private @"__auto.blk.5.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.5.attn_output.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.5.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.5.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.5.attn_output.weight:d" = #stream.parameter.named<"model"::"blk.5.attn_output.weight:d"> : tensor<f32>
util.global private @__auto.blk.5.ffn_norm.weight = #stream.parameter.named<"model"::"blk.5.ffn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.5.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.5.ffn_gate.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.5.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.5.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.5.ffn_gate.weight:d" = #stream.parameter.named<"model"::"blk.5.ffn_gate.weight:d"> : tensor<f32>
util.global private @"__auto.blk.5.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.5.ffn_up.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.5.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.5.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.5.ffn_up.weight:d" = #stream.parameter.named<"model"::"blk.5.ffn_up.weight:d"> : tensor<f32>
util.global private @"__auto.blk.5.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.5.ffn_down.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.5.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.5.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>
util.global private @"__auto.blk.5.ffn_down.weight:d" = #stream.parameter.named<"model"::"blk.5.ffn_down.weight:d"> : tensor<f32>
util.global private @__auto.blk.6.attn_norm.weight = #stream.parameter.named<"model"::"blk.6.attn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.6.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.6.attn_q.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.6.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.6.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.6.attn_q.q_output:rscale" = #stream.parameter.named<"model"::"blk.6.attn_q.q_output:rscale"> : tensor<f32>
util.global private @"__auto.blk.6.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.6.attn_k.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.6.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.6.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.6.attn_k.q_output:rscale" = #stream.parameter.named<"model"::"blk.6.attn_k.q_output:rscale"> : tensor<f32>
util.global private @"__auto.blk.6.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.6.attn_v.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.6.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.6.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.6.attn_v.q_output:rscale" = #stream.parameter.named<"model"::"blk.6.attn_v.q_output:rscale"> : tensor<f32>
util.global private @"__auto.blk.6.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.6.attn_output.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.6.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.6.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.6.attn_output.weight:d" = #stream.parameter.named<"model"::"blk.6.attn_output.weight:d"> : tensor<f32>
util.global private @__auto.blk.6.ffn_norm.weight = #stream.parameter.named<"model"::"blk.6.ffn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.6.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.6.ffn_gate.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.6.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.6.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.6.ffn_gate.weight:d" = #stream.parameter.named<"model"::"blk.6.ffn_gate.weight:d"> : tensor<f32>
util.global private @"__auto.blk.6.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.6.ffn_up.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.6.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.6.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.6.ffn_up.weight:d" = #stream.parameter.named<"model"::"blk.6.ffn_up.weight:d"> : tensor<f32>
util.global private @"__auto.blk.6.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.6.ffn_down.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.6.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.6.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>
util.global private @"__auto.blk.6.ffn_down.weight:d" = #stream.parameter.named<"model"::"blk.6.ffn_down.weight:d"> : tensor<f32>
util.global private @__auto.blk.7.attn_norm.weight = #stream.parameter.named<"model"::"blk.7.attn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.7.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.7.attn_q.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.7.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.7.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.7.attn_q.q_output:rscale" = #stream.parameter.named<"model"::"blk.7.attn_q.q_output:rscale"> : tensor<f32>
util.global private @"__auto.blk.7.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.7.attn_k.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.7.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.7.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.7.attn_k.q_output:rscale" = #stream.parameter.named<"model"::"blk.7.attn_k.q_output:rscale"> : tensor<f32>
util.global private @"__auto.blk.7.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.7.attn_v.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.7.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.7.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.7.attn_v.q_output:rscale" = #stream.parameter.named<"model"::"blk.7.attn_v.q_output:rscale"> : tensor<f32>
util.global private @"__auto.blk.7.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.7.attn_output.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.7.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.7.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.7.attn_output.weight:d" = #stream.parameter.named<"model"::"blk.7.attn_output.weight:d"> : tensor<f32>
util.global private @__auto.blk.7.ffn_norm.weight = #stream.parameter.named<"model"::"blk.7.ffn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.7.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.7.ffn_gate.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.7.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.7.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.7.ffn_gate.weight:d" = #stream.parameter.named<"model"::"blk.7.ffn_gate.weight:d"> : tensor<f32>
util.global private @"__auto.blk.7.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.7.ffn_up.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.7.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.7.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.7.ffn_up.weight:d" = #stream.parameter.named<"model"::"blk.7.ffn_up.weight:d"> : tensor<f32>
util.global private @"__auto.blk.7.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.7.ffn_down.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.7.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.7.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>
util.global private @"__auto.blk.7.ffn_down.weight:d" = #stream.parameter.named<"model"::"blk.7.ffn_down.weight:d"> : tensor<f32>
util.global private @__auto.blk.8.attn_norm.weight = #stream.parameter.named<"model"::"blk.8.attn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.8.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.8.attn_q.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.8.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.8.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.8.attn_q.q_output:rscale" = #stream.parameter.named<"model"::"blk.8.attn_q.q_output:rscale"> : tensor<f32>
util.global private @"__auto.blk.8.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.8.attn_k.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.8.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.8.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.8.attn_k.q_output:rscale" = #stream.parameter.named<"model"::"blk.8.attn_k.q_output:rscale"> : tensor<f32>
util.global private @"__auto.blk.8.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.8.attn_v.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.8.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.8.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.8.attn_v.q_output:rscale" = #stream.parameter.named<"model"::"blk.8.attn_v.q_output:rscale"> : tensor<f32>
util.global private @"__auto.blk.8.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.8.attn_output.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.8.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.8.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.8.attn_output.weight:d" = #stream.parameter.named<"model"::"blk.8.attn_output.weight:d"> : tensor<f32>
util.global private @__auto.blk.8.ffn_norm.weight = #stream.parameter.named<"model"::"blk.8.ffn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.8.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.8.ffn_gate.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.8.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.8.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.8.ffn_gate.weight:d" = #stream.parameter.named<"model"::"blk.8.ffn_gate.weight:d"> : tensor<f32>
util.global private @"__auto.blk.8.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.8.ffn_up.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.8.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.8.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.8.ffn_up.weight:d" = #stream.parameter.named<"model"::"blk.8.ffn_up.weight:d"> : tensor<f32>
util.global private @"__auto.blk.8.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.8.ffn_down.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.8.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.8.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>
util.global private @"__auto.blk.8.ffn_down.weight:d" = #stream.parameter.named<"model"::"blk.8.ffn_down.weight:d"> : tensor<f32>
util.global private @__auto.blk.9.attn_norm.weight = #stream.parameter.named<"model"::"blk.9.attn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.9.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.9.attn_q.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.9.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.9.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.9.attn_q.q_output:rscale" = #stream.parameter.named<"model"::"blk.9.attn_q.q_output:rscale"> : tensor<f32>
util.global private @"__auto.blk.9.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.9.attn_k.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.9.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.9.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.9.attn_k.q_output:rscale" = #stream.parameter.named<"model"::"blk.9.attn_k.q_output:rscale"> : tensor<f32>
util.global private @"__auto.blk.9.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.9.attn_v.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.9.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.9.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.9.attn_v.q_output:rscale" = #stream.parameter.named<"model"::"blk.9.attn_v.q_output:rscale"> : tensor<f32>
util.global private @"__auto.blk.9.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.9.attn_output.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.9.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.9.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.9.attn_output.weight:d" = #stream.parameter.named<"model"::"blk.9.attn_output.weight:d"> : tensor<f32>
util.global private @__auto.blk.9.ffn_norm.weight = #stream.parameter.named<"model"::"blk.9.ffn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.9.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.9.ffn_gate.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.9.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.9.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.9.ffn_gate.weight:d" = #stream.parameter.named<"model"::"blk.9.ffn_gate.weight:d"> : tensor<f32>
util.global private @"__auto.blk.9.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.9.ffn_up.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.9.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.9.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.9.ffn_up.weight:d" = #stream.parameter.named<"model"::"blk.9.ffn_up.weight:d"> : tensor<f32>
util.global private @"__auto.blk.9.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.9.ffn_down.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.9.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.9.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>
util.global private @"__auto.blk.9.ffn_down.weight:d" = #stream.parameter.named<"model"::"blk.9.ffn_down.weight:d"> : tensor<f32>
util.global private @__auto.blk.10.attn_norm.weight = #stream.parameter.named<"model"::"blk.10.attn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.10.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.10.attn_q.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.10.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.10.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.10.attn_q.q_output:rscale" = #stream.parameter.named<"model"::"blk.10.attn_q.q_output:rscale"> : tensor<f32>
util.global private @"__auto.blk.10.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.10.attn_k.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.10.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.10.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.10.attn_k.q_output:rscale" = #stream.parameter.named<"model"::"blk.10.attn_k.q_output:rscale"> : tensor<f32>
util.global private @"__auto.blk.10.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.10.attn_v.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.10.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.10.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.10.attn_v.q_output:rscale" = #stream.parameter.named<"model"::"blk.10.attn_v.q_output:rscale"> : tensor<f32>
util.global private @"__auto.blk.10.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.10.attn_output.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.10.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.10.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.10.attn_output.weight:d" = #stream.parameter.named<"model"::"blk.10.attn_output.weight:d"> : tensor<f32>
util.global private @__auto.blk.10.ffn_norm.weight = #stream.parameter.named<"model"::"blk.10.ffn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.10.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.10.ffn_gate.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.10.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.10.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.10.ffn_gate.weight:d" = #stream.parameter.named<"model"::"blk.10.ffn_gate.weight:d"> : tensor<f32>
util.global private @"__auto.blk.10.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.10.ffn_up.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.10.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.10.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.10.ffn_up.weight:d" = #stream.parameter.named<"model"::"blk.10.ffn_up.weight:d"> : tensor<f32>
util.global private @"__auto.blk.10.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.10.ffn_down.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.10.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.10.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>
util.global private @"__auto.blk.10.ffn_down.weight:d" = #stream.parameter.named<"model"::"blk.10.ffn_down.weight:d"> : tensor<f32>
util.global private @__auto.blk.11.attn_norm.weight = #stream.parameter.named<"model"::"blk.11.attn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.11.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.11.attn_q.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.11.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.11.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.11.attn_q.q_output:rscale" = #stream.parameter.named<"model"::"blk.11.attn_q.q_output:rscale"> : tensor<f32>
util.global private @"__auto.blk.11.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.11.attn_k.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.11.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.11.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.11.attn_k.q_output:rscale" = #stream.parameter.named<"model"::"blk.11.attn_k.q_output:rscale"> : tensor<f32>
util.global private @"__auto.blk.11.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.11.attn_v.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.11.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.11.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.11.attn_v.q_output:rscale" = #stream.parameter.named<"model"::"blk.11.attn_v.q_output:rscale"> : tensor<f32>
util.global private @"__auto.blk.11.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.11.attn_output.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.11.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.11.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.11.attn_output.weight:d" = #stream.parameter.named<"model"::"blk.11.attn_output.weight:d"> : tensor<f32>
util.global private @__auto.blk.11.ffn_norm.weight = #stream.parameter.named<"model"::"blk.11.ffn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.11.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.11.ffn_gate.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.11.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.11.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.11.ffn_gate.weight:d" = #stream.parameter.named<"model"::"blk.11.ffn_gate.weight:d"> : tensor<f32>
util.global private @"__auto.blk.11.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.11.ffn_up.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.11.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.11.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.11.ffn_up.weight:d" = #stream.parameter.named<"model"::"blk.11.ffn_up.weight:d"> : tensor<f32>
util.global private @"__auto.blk.11.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.11.ffn_down.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.11.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.11.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>
util.global private @"__auto.blk.11.ffn_down.weight:d" = #stream.parameter.named<"model"::"blk.11.ffn_down.weight:d"> : tensor<f32>
util.global private @__auto.blk.12.attn_norm.weight = #stream.parameter.named<"model"::"blk.12.attn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.12.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.12.attn_q.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.12.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.12.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.12.attn_q.q_output:rscale" = #stream.parameter.named<"model"::"blk.12.attn_q.q_output:rscale"> : tensor<f32>
util.global private @"__auto.blk.12.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.12.attn_k.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.12.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.12.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.12.attn_k.q_output:rscale" = #stream.parameter.named<"model"::"blk.12.attn_k.q_output:rscale"> : tensor<f32>
util.global private @"__auto.blk.12.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.12.attn_v.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.12.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.12.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.12.attn_v.q_output:rscale" = #stream.parameter.named<"model"::"blk.12.attn_v.q_output:rscale"> : tensor<f32>
util.global private @"__auto.blk.12.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.12.attn_output.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.12.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.12.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.12.attn_output.weight:d" = #stream.parameter.named<"model"::"blk.12.attn_output.weight:d"> : tensor<f32>
util.global private @__auto.blk.12.ffn_norm.weight = #stream.parameter.named<"model"::"blk.12.ffn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.12.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.12.ffn_gate.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.12.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.12.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.12.ffn_gate.weight:d" = #stream.parameter.named<"model"::"blk.12.ffn_gate.weight:d"> : tensor<f32>
util.global private @"__auto.blk.12.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.12.ffn_up.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.12.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.12.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.12.ffn_up.weight:d" = #stream.parameter.named<"model"::"blk.12.ffn_up.weight:d"> : tensor<f32>
util.global private @"__auto.blk.12.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.12.ffn_down.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.12.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.12.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>
util.global private @"__auto.blk.12.ffn_down.weight:d" = #stream.parameter.named<"model"::"blk.12.ffn_down.weight:d"> : tensor<f32>
util.global private @__auto.blk.13.attn_norm.weight = #stream.parameter.named<"model"::"blk.13.attn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.13.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.13.attn_q.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.13.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.13.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.13.attn_q.q_output:rscale" = #stream.parameter.named<"model"::"blk.13.attn_q.q_output:rscale"> : tensor<f32>
util.global private @"__auto.blk.13.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.13.attn_k.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.13.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.13.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.13.attn_k.q_output:rscale" = #stream.parameter.named<"model"::"blk.13.attn_k.q_output:rscale"> : tensor<f32>
util.global private @"__auto.blk.13.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.13.attn_v.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.13.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.13.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.13.attn_v.q_output:rscale" = #stream.parameter.named<"model"::"blk.13.attn_v.q_output:rscale"> : tensor<f32>
util.global private @"__auto.blk.13.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.13.attn_output.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.13.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.13.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.13.attn_output.weight:d" = #stream.parameter.named<"model"::"blk.13.attn_output.weight:d"> : tensor<f32>
util.global private @__auto.blk.13.ffn_norm.weight = #stream.parameter.named<"model"::"blk.13.ffn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.13.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.13.ffn_gate.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.13.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.13.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.13.ffn_gate.weight:d" = #stream.parameter.named<"model"::"blk.13.ffn_gate.weight:d"> : tensor<f32>
util.global private @"__auto.blk.13.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.13.ffn_up.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.13.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.13.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.13.ffn_up.weight:d" = #stream.parameter.named<"model"::"blk.13.ffn_up.weight:d"> : tensor<f32>
util.global private @"__auto.blk.13.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.13.ffn_down.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.13.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.13.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>
util.global private @"__auto.blk.13.ffn_down.weight:d" = #stream.parameter.named<"model"::"blk.13.ffn_down.weight:d"> : tensor<f32>
util.global private @__auto.blk.14.attn_norm.weight = #stream.parameter.named<"model"::"blk.14.attn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.14.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.14.attn_q.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.14.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.14.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.14.attn_q.q_output:rscale" = #stream.parameter.named<"model"::"blk.14.attn_q.q_output:rscale"> : tensor<f32>
util.global private @"__auto.blk.14.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.14.attn_k.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.14.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.14.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.14.attn_k.q_output:rscale" = #stream.parameter.named<"model"::"blk.14.attn_k.q_output:rscale"> : tensor<f32>
util.global private @"__auto.blk.14.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.14.attn_v.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.14.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.14.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.14.attn_v.q_output:rscale" = #stream.parameter.named<"model"::"blk.14.attn_v.q_output:rscale"> : tensor<f32>
util.global private @"__auto.blk.14.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.14.attn_output.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.14.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.14.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.14.attn_output.weight:d" = #stream.parameter.named<"model"::"blk.14.attn_output.weight:d"> : tensor<f32>
util.global private @__auto.blk.14.ffn_norm.weight = #stream.parameter.named<"model"::"blk.14.ffn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.14.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.14.ffn_gate.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.14.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.14.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.14.ffn_gate.weight:d" = #stream.parameter.named<"model"::"blk.14.ffn_gate.weight:d"> : tensor<f32>
util.global private @"__auto.blk.14.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.14.ffn_up.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.14.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.14.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.14.ffn_up.weight:d" = #stream.parameter.named<"model"::"blk.14.ffn_up.weight:d"> : tensor<f32>
util.global private @"__auto.blk.14.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.14.ffn_down.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.14.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.14.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>
util.global private @"__auto.blk.14.ffn_down.weight:d" = #stream.parameter.named<"model"::"blk.14.ffn_down.weight:d"> : tensor<f32>
util.global private @__auto.blk.15.attn_norm.weight = #stream.parameter.named<"model"::"blk.15.attn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.15.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.15.attn_q.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.15.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.15.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.15.attn_q.q_output:rscale" = #stream.parameter.named<"model"::"blk.15.attn_q.q_output:rscale"> : tensor<f32>
util.global private @"__auto.blk.15.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.15.attn_k.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.15.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.15.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.15.attn_k.q_output:rscale" = #stream.parameter.named<"model"::"blk.15.attn_k.q_output:rscale"> : tensor<f32>
util.global private @"__auto.blk.15.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.15.attn_v.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.15.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.15.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.15.attn_v.q_output:rscale" = #stream.parameter.named<"model"::"blk.15.attn_v.q_output:rscale"> : tensor<f32>
util.global private @"__auto.blk.15.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.15.attn_output.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.15.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.15.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.15.attn_output.weight:d" = #stream.parameter.named<"model"::"blk.15.attn_output.weight:d"> : tensor<f32>
util.global private @__auto.blk.15.ffn_norm.weight = #stream.parameter.named<"model"::"blk.15.ffn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.15.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.15.ffn_gate.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.15.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.15.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.15.ffn_gate.weight:d" = #stream.parameter.named<"model"::"blk.15.ffn_gate.weight:d"> : tensor<f32>
util.global private @"__auto.blk.15.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.15.ffn_up.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.15.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.15.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.15.ffn_up.weight:d" = #stream.parameter.named<"model"::"blk.15.ffn_up.weight:d"> : tensor<f32>
util.global private @"__auto.blk.15.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.15.ffn_down.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.15.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.15.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>
util.global private @"__auto.blk.15.ffn_down.weight:d" = #stream.parameter.named<"model"::"blk.15.ffn_down.weight:d"> : tensor<f32>
util.global private @__auto.blk.16.attn_norm.weight = #stream.parameter.named<"model"::"blk.16.attn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.16.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.16.attn_q.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.16.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.16.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.16.attn_q.q_output:rscale" = #stream.parameter.named<"model"::"blk.16.attn_q.q_output:rscale"> : tensor<f32>
util.global private @"__auto.blk.16.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.16.attn_k.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.16.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.16.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.16.attn_k.q_output:rscale" = #stream.parameter.named<"model"::"blk.16.attn_k.q_output:rscale"> : tensor<f32>
util.global private @"__auto.blk.16.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.16.attn_v.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.16.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.16.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.16.attn_v.q_output:rscale" = #stream.parameter.named<"model"::"blk.16.attn_v.q_output:rscale"> : tensor<f32>
util.global private @"__auto.blk.16.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.16.attn_output.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.16.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.16.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.16.attn_output.weight:d" = #stream.parameter.named<"model"::"blk.16.attn_output.weight:d"> : tensor<f32>
util.global private @__auto.blk.16.ffn_norm.weight = #stream.parameter.named<"model"::"blk.16.ffn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.16.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.16.ffn_gate.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.16.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.16.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.16.ffn_gate.weight:d" = #stream.parameter.named<"model"::"blk.16.ffn_gate.weight:d"> : tensor<f32>
util.global private @"__auto.blk.16.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.16.ffn_up.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.16.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.16.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.16.ffn_up.weight:d" = #stream.parameter.named<"model"::"blk.16.ffn_up.weight:d"> : tensor<f32>
util.global private @"__auto.blk.16.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.16.ffn_down.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.16.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.16.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>
util.global private @"__auto.blk.16.ffn_down.weight:d" = #stream.parameter.named<"model"::"blk.16.ffn_down.weight:d"> : tensor<f32>
util.global private @__auto.blk.17.attn_norm.weight = #stream.parameter.named<"model"::"blk.17.attn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.17.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.17.attn_q.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.17.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.17.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.17.attn_q.q_output:rscale" = #stream.parameter.named<"model"::"blk.17.attn_q.q_output:rscale"> : tensor<f32>
util.global private @"__auto.blk.17.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.17.attn_k.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.17.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.17.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.17.attn_k.q_output:rscale" = #stream.parameter.named<"model"::"blk.17.attn_k.q_output:rscale"> : tensor<f32>
util.global private @"__auto.blk.17.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.17.attn_v.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.17.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.17.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.17.attn_v.q_output:rscale" = #stream.parameter.named<"model"::"blk.17.attn_v.q_output:rscale"> : tensor<f32>
util.global private @"__auto.blk.17.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.17.attn_output.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.17.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.17.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.17.attn_output.weight:d" = #stream.parameter.named<"model"::"blk.17.attn_output.weight:d"> : tensor<f32>
util.global private @__auto.blk.17.ffn_norm.weight = #stream.parameter.named<"model"::"blk.17.ffn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.17.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.17.ffn_gate.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.17.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.17.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.17.ffn_gate.weight:d" = #stream.parameter.named<"model"::"blk.17.ffn_gate.weight:d"> : tensor<f32>
util.global private @"__auto.blk.17.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.17.ffn_up.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.17.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.17.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.17.ffn_up.weight:d" = #stream.parameter.named<"model"::"blk.17.ffn_up.weight:d"> : tensor<f32>
util.global private @"__auto.blk.17.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.17.ffn_down.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.17.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.17.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ> | |
util.global private @"__auto.blk.17.ffn_down.weight:d" = #stream.parameter.named<"model"::"blk.17.ffn_down.weight:d"> : tensor<f32> | |
util.global private @__auto.blk.18.attn_norm.weight = #stream.parameter.named<"model"::"blk.18.attn_norm.weight"> : tensor<4096xbf16> | |
util.global private @"__auto.blk.18.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.18.attn_q.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.18.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.18.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.18.attn_q.q_output:rscale" = #stream.parameter.named<"model"::"blk.18.attn_q.q_output:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.18.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.18.attn_k.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.18.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.18.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.18.attn_k.q_output:rscale" = #stream.parameter.named<"model"::"blk.18.attn_k.q_output:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.18.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.18.attn_v.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.18.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.18.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.18.attn_v.q_output:rscale" = #stream.parameter.named<"model"::"blk.18.attn_v.q_output:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.18.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.18.attn_output.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.18.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.18.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.18.attn_output.weight:d" = #stream.parameter.named<"model"::"blk.18.attn_output.weight:d"> : tensor<f32> | |
util.global private @__auto.blk.18.ffn_norm.weight = #stream.parameter.named<"model"::"blk.18.ffn_norm.weight"> : tensor<4096xbf16> | |
util.global private @"__auto.blk.18.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.18.ffn_gate.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.18.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.18.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.18.ffn_gate.weight:d" = #stream.parameter.named<"model"::"blk.18.ffn_gate.weight:d"> : tensor<f32> | |
util.global private @"__auto.blk.18.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.18.ffn_up.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.18.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.18.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.18.ffn_up.weight:d" = #stream.parameter.named<"model"::"blk.18.ffn_up.weight:d"> : tensor<f32> | |
util.global private @"__auto.blk.18.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.18.ffn_down.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.18.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.18.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ> | |
util.global private @"__auto.blk.18.ffn_down.weight:d" = #stream.parameter.named<"model"::"blk.18.ffn_down.weight:d"> : tensor<f32> | |
util.global private @__auto.blk.19.attn_norm.weight = #stream.parameter.named<"model"::"blk.19.attn_norm.weight"> : tensor<4096xbf16> | |
util.global private @"__auto.blk.19.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.19.attn_q.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.19.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.19.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.19.attn_q.q_output:rscale" = #stream.parameter.named<"model"::"blk.19.attn_q.q_output:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.19.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.19.attn_k.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.19.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.19.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.19.attn_k.q_output:rscale" = #stream.parameter.named<"model"::"blk.19.attn_k.q_output:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.19.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.19.attn_v.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.19.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.19.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.19.attn_v.q_output:rscale" = #stream.parameter.named<"model"::"blk.19.attn_v.q_output:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.19.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.19.attn_output.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.19.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.19.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.19.attn_output.weight:d" = #stream.parameter.named<"model"::"blk.19.attn_output.weight:d"> : tensor<f32> | |
util.global private @__auto.blk.19.ffn_norm.weight = #stream.parameter.named<"model"::"blk.19.ffn_norm.weight"> : tensor<4096xbf16> | |
util.global private @"__auto.blk.19.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.19.ffn_gate.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.19.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.19.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.19.ffn_gate.weight:d" = #stream.parameter.named<"model"::"blk.19.ffn_gate.weight:d"> : tensor<f32> | |
util.global private @"__auto.blk.19.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.19.ffn_up.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.19.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.19.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.19.ffn_up.weight:d" = #stream.parameter.named<"model"::"blk.19.ffn_up.weight:d"> : tensor<f32> | |
util.global private @"__auto.blk.19.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.19.ffn_down.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.19.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.19.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ> | |
util.global private @"__auto.blk.19.ffn_down.weight:d" = #stream.parameter.named<"model"::"blk.19.ffn_down.weight:d"> : tensor<f32> | |
util.global private @__auto.blk.20.attn_norm.weight = #stream.parameter.named<"model"::"blk.20.attn_norm.weight"> : tensor<4096xbf16> | |
util.global private @"__auto.blk.20.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.20.attn_q.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.20.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.20.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.20.attn_q.q_output:rscale" = #stream.parameter.named<"model"::"blk.20.attn_q.q_output:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.20.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.20.attn_k.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.20.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.20.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.20.attn_k.q_output:rscale" = #stream.parameter.named<"model"::"blk.20.attn_k.q_output:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.20.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.20.attn_v.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.20.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.20.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.20.attn_v.q_output:rscale" = #stream.parameter.named<"model"::"blk.20.attn_v.q_output:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.20.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.20.attn_output.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.20.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.20.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.20.attn_output.weight:d" = #stream.parameter.named<"model"::"blk.20.attn_output.weight:d"> : tensor<f32> | |
util.global private @__auto.blk.20.ffn_norm.weight = #stream.parameter.named<"model"::"blk.20.ffn_norm.weight"> : tensor<4096xbf16> | |
util.global private @"__auto.blk.20.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.20.ffn_gate.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.20.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.20.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.20.ffn_gate.weight:d" = #stream.parameter.named<"model"::"blk.20.ffn_gate.weight:d"> : tensor<f32> | |
util.global private @"__auto.blk.20.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.20.ffn_up.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.20.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.20.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.20.ffn_up.weight:d" = #stream.parameter.named<"model"::"blk.20.ffn_up.weight:d"> : tensor<f32> | |
util.global private @"__auto.blk.20.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.20.ffn_down.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.20.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.20.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ> | |
util.global private @"__auto.blk.20.ffn_down.weight:d" = #stream.parameter.named<"model"::"blk.20.ffn_down.weight:d"> : tensor<f32> | |
util.global private @__auto.blk.21.attn_norm.weight = #stream.parameter.named<"model"::"blk.21.attn_norm.weight"> : tensor<4096xbf16> | |
util.global private @"__auto.blk.21.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.21.attn_q.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.21.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.21.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.21.attn_q.q_output:rscale" = #stream.parameter.named<"model"::"blk.21.attn_q.q_output:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.21.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.21.attn_k.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.21.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.21.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.21.attn_k.q_output:rscale" = #stream.parameter.named<"model"::"blk.21.attn_k.q_output:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.21.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.21.attn_v.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.21.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.21.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.21.attn_v.q_output:rscale" = #stream.parameter.named<"model"::"blk.21.attn_v.q_output:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.21.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.21.attn_output.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.21.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.21.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.21.attn_output.weight:d" = #stream.parameter.named<"model"::"blk.21.attn_output.weight:d"> : tensor<f32> | |
util.global private @__auto.blk.21.ffn_norm.weight = #stream.parameter.named<"model"::"blk.21.ffn_norm.weight"> : tensor<4096xbf16> | |
util.global private @"__auto.blk.21.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.21.ffn_gate.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.21.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.21.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.21.ffn_gate.weight:d" = #stream.parameter.named<"model"::"blk.21.ffn_gate.weight:d"> : tensor<f32> | |
util.global private @"__auto.blk.21.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.21.ffn_up.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.21.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.21.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.21.ffn_up.weight:d" = #stream.parameter.named<"model"::"blk.21.ffn_up.weight:d"> : tensor<f32> | |
util.global private @"__auto.blk.21.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.21.ffn_down.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.21.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.21.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ> | |
util.global private @"__auto.blk.21.ffn_down.weight:d" = #stream.parameter.named<"model"::"blk.21.ffn_down.weight:d"> : tensor<f32> | |
util.global private @__auto.blk.22.attn_norm.weight = #stream.parameter.named<"model"::"blk.22.attn_norm.weight"> : tensor<4096xbf16> | |
util.global private @"__auto.blk.22.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.22.attn_q.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.22.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.22.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.22.attn_q.q_output:rscale" = #stream.parameter.named<"model"::"blk.22.attn_q.q_output:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.22.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.22.attn_k.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.22.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.22.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.22.attn_k.q_output:rscale" = #stream.parameter.named<"model"::"blk.22.attn_k.q_output:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.22.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.22.attn_v.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.22.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.22.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.22.attn_v.q_output:rscale" = #stream.parameter.named<"model"::"blk.22.attn_v.q_output:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.22.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.22.attn_output.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.22.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.22.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.22.attn_output.weight:d" = #stream.parameter.named<"model"::"blk.22.attn_output.weight:d"> : tensor<f32> | |
util.global private @__auto.blk.22.ffn_norm.weight = #stream.parameter.named<"model"::"blk.22.ffn_norm.weight"> : tensor<4096xbf16> | |
util.global private @"__auto.blk.22.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.22.ffn_gate.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.22.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.22.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.22.ffn_gate.weight:d" = #stream.parameter.named<"model"::"blk.22.ffn_gate.weight:d"> : tensor<f32> | |
util.global private @"__auto.blk.22.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.22.ffn_up.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.22.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.22.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.22.ffn_up.weight:d" = #stream.parameter.named<"model"::"blk.22.ffn_up.weight:d"> : tensor<f32> | |
util.global private @"__auto.blk.22.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.22.ffn_down.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.22.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.22.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ> | |
util.global private @"__auto.blk.22.ffn_down.weight:d" = #stream.parameter.named<"model"::"blk.22.ffn_down.weight:d"> : tensor<f32> | |
util.global private @__auto.blk.23.attn_norm.weight = #stream.parameter.named<"model"::"blk.23.attn_norm.weight"> : tensor<4096xbf16> | |
util.global private @"__auto.blk.23.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.23.attn_q.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.23.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.23.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.23.attn_q.q_output:rscale" = #stream.parameter.named<"model"::"blk.23.attn_q.q_output:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.23.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.23.attn_k.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.23.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.23.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.23.attn_k.q_output:rscale" = #stream.parameter.named<"model"::"blk.23.attn_k.q_output:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.23.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.23.attn_v.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.23.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.23.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.23.attn_v.q_output:rscale" = #stream.parameter.named<"model"::"blk.23.attn_v.q_output:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.23.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.23.attn_output.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.23.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.23.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.23.attn_output.weight:d" = #stream.parameter.named<"model"::"blk.23.attn_output.weight:d"> : tensor<f32> | |
util.global private @__auto.blk.23.ffn_norm.weight = #stream.parameter.named<"model"::"blk.23.ffn_norm.weight"> : tensor<4096xbf16> | |
util.global private @"__auto.blk.23.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.23.ffn_gate.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.23.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.23.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.23.ffn_gate.weight:d" = #stream.parameter.named<"model"::"blk.23.ffn_gate.weight:d"> : tensor<f32> | |
util.global private @"__auto.blk.23.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.23.ffn_up.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.23.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.23.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.23.ffn_up.weight:d" = #stream.parameter.named<"model"::"blk.23.ffn_up.weight:d"> : tensor<f32> | |
util.global private @"__auto.blk.23.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.23.ffn_down.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.23.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.23.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ> | |
util.global private @"__auto.blk.23.ffn_down.weight:d" = #stream.parameter.named<"model"::"blk.23.ffn_down.weight:d"> : tensor<f32> | |
util.global private @__auto.blk.24.attn_norm.weight = #stream.parameter.named<"model"::"blk.24.attn_norm.weight"> : tensor<4096xbf16> | |
util.global private @"__auto.blk.24.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.24.attn_q.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.24.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.24.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.24.attn_q.q_output:rscale" = #stream.parameter.named<"model"::"blk.24.attn_q.q_output:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.24.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.24.attn_k.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.24.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.24.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.24.attn_k.q_output:rscale" = #stream.parameter.named<"model"::"blk.24.attn_k.q_output:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.24.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.24.attn_v.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.24.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.24.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.24.attn_v.q_output:rscale" = #stream.parameter.named<"model"::"blk.24.attn_v.q_output:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.24.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.24.attn_output.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.24.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.24.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.24.attn_output.weight:d" = #stream.parameter.named<"model"::"blk.24.attn_output.weight:d"> : tensor<f32> | |
util.global private @__auto.blk.24.ffn_norm.weight = #stream.parameter.named<"model"::"blk.24.ffn_norm.weight"> : tensor<4096xbf16> | |
util.global private @"__auto.blk.24.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.24.ffn_gate.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.24.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.24.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.24.ffn_gate.weight:d" = #stream.parameter.named<"model"::"blk.24.ffn_gate.weight:d"> : tensor<f32> | |
util.global private @"__auto.blk.24.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.24.ffn_up.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.24.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.24.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.24.ffn_up.weight:d" = #stream.parameter.named<"model"::"blk.24.ffn_up.weight:d"> : tensor<f32> | |
util.global private @"__auto.blk.24.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.24.ffn_down.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.24.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.24.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ> | |
util.global private @"__auto.blk.24.ffn_down.weight:d" = #stream.parameter.named<"model"::"blk.24.ffn_down.weight:d"> : tensor<f32> | |
util.global private @__auto.blk.25.attn_norm.weight = #stream.parameter.named<"model"::"blk.25.attn_norm.weight"> : tensor<4096xbf16> | |
util.global private @"__auto.blk.25.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.25.attn_q.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.25.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.25.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.25.attn_q.q_output:rscale" = #stream.parameter.named<"model"::"blk.25.attn_q.q_output:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.25.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.25.attn_k.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.25.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.25.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.25.attn_k.q_output:rscale" = #stream.parameter.named<"model"::"blk.25.attn_k.q_output:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.25.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.25.attn_v.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.25.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.25.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.25.attn_v.q_output:rscale" = #stream.parameter.named<"model"::"blk.25.attn_v.q_output:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.25.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.25.attn_output.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.25.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.25.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.25.attn_output.weight:d" = #stream.parameter.named<"model"::"blk.25.attn_output.weight:d"> : tensor<f32> | |
util.global private @__auto.blk.25.ffn_norm.weight = #stream.parameter.named<"model"::"blk.25.ffn_norm.weight"> : tensor<4096xbf16> | |
util.global private @"__auto.blk.25.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.25.ffn_gate.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.25.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.25.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.25.ffn_gate.weight:d" = #stream.parameter.named<"model"::"blk.25.ffn_gate.weight:d"> : tensor<f32> | |
util.global private @"__auto.blk.25.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.25.ffn_up.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.25.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.25.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.25.ffn_up.weight:d" = #stream.parameter.named<"model"::"blk.25.ffn_up.weight:d"> : tensor<f32> | |
util.global private @"__auto.blk.25.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.25.ffn_down.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.25.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.25.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ> | |
util.global private @"__auto.blk.25.ffn_down.weight:d" = #stream.parameter.named<"model"::"blk.25.ffn_down.weight:d"> : tensor<f32> | |
util.global private @__auto.blk.26.attn_norm.weight = #stream.parameter.named<"model"::"blk.26.attn_norm.weight"> : tensor<4096xbf16> | |
util.global private @"__auto.blk.26.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.26.attn_q.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.26.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.26.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.26.attn_q.q_output:rscale" = #stream.parameter.named<"model"::"blk.26.attn_q.q_output:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.26.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.26.attn_k.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.26.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.26.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.26.attn_k.q_output:rscale" = #stream.parameter.named<"model"::"blk.26.attn_k.q_output:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.26.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.26.attn_v.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.26.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.26.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.26.attn_v.q_output:rscale" = #stream.parameter.named<"model"::"blk.26.attn_v.q_output:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.26.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.26.attn_output.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.26.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.26.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.26.attn_output.weight:d" = #stream.parameter.named<"model"::"blk.26.attn_output.weight:d"> : tensor<f32> | |
util.global private @__auto.blk.26.ffn_norm.weight = #stream.parameter.named<"model"::"blk.26.ffn_norm.weight"> : tensor<4096xbf16> | |
util.global private @"__auto.blk.26.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.26.ffn_gate.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.26.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.26.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.26.ffn_gate.weight:d" = #stream.parameter.named<"model"::"blk.26.ffn_gate.weight:d"> : tensor<f32> | |
util.global private @"__auto.blk.26.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.26.ffn_up.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.26.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.26.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.26.ffn_up.weight:d" = #stream.parameter.named<"model"::"blk.26.ffn_up.weight:d"> : tensor<f32> | |
util.global private @"__auto.blk.26.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.26.ffn_down.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.26.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.26.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ> | |
util.global private @"__auto.blk.26.ffn_down.weight:d" = #stream.parameter.named<"model"::"blk.26.ffn_down.weight:d"> : tensor<f32> | |
util.global private @__auto.blk.27.attn_norm.weight = #stream.parameter.named<"model"::"blk.27.attn_norm.weight"> : tensor<4096xbf16> | |
util.global private @"__auto.blk.27.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.27.attn_q.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.27.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.27.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.27.attn_q.q_output:rscale" = #stream.parameter.named<"model"::"blk.27.attn_q.q_output:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.27.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.27.attn_k.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.27.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.27.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.27.attn_k.q_output:rscale" = #stream.parameter.named<"model"::"blk.27.attn_k.q_output:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.27.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.27.attn_v.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.27.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.27.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.27.attn_v.q_output:rscale" = #stream.parameter.named<"model"::"blk.27.attn_v.q_output:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.27.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.27.attn_output.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.27.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.27.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.27.attn_output.weight:d" = #stream.parameter.named<"model"::"blk.27.attn_output.weight:d"> : tensor<f32> | |
util.global private @__auto.blk.27.ffn_norm.weight = #stream.parameter.named<"model"::"blk.27.ffn_norm.weight"> : tensor<4096xbf16> | |
util.global private @"__auto.blk.27.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.27.ffn_gate.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.27.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.27.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.27.ffn_gate.weight:d" = #stream.parameter.named<"model"::"blk.27.ffn_gate.weight:d"> : tensor<f32> | |
util.global private @"__auto.blk.27.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.27.ffn_up.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.27.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.27.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.27.ffn_up.weight:d" = #stream.parameter.named<"model"::"blk.27.ffn_up.weight:d"> : tensor<f32> | |
util.global private @"__auto.blk.27.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.27.ffn_down.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.27.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.27.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ> | |
util.global private @"__auto.blk.27.ffn_down.weight:d" = #stream.parameter.named<"model"::"blk.27.ffn_down.weight:d"> : tensor<f32> | |
util.global private @__auto.blk.28.attn_norm.weight = #stream.parameter.named<"model"::"blk.28.attn_norm.weight"> : tensor<4096xbf16> | |
util.global private @"__auto.blk.28.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.28.attn_q.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.28.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.28.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.28.attn_q.q_output:rscale" = #stream.parameter.named<"model"::"blk.28.attn_q.q_output:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.28.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.28.attn_k.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.28.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.28.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.28.attn_k.q_output:rscale" = #stream.parameter.named<"model"::"blk.28.attn_k.q_output:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.28.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.28.attn_v.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.28.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.28.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.28.attn_v.q_output:rscale" = #stream.parameter.named<"model"::"blk.28.attn_v.q_output:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.28.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.28.attn_output.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.28.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.28.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.28.attn_output.weight:d" = #stream.parameter.named<"model"::"blk.28.attn_output.weight:d"> : tensor<f32> | |
util.global private @__auto.blk.28.ffn_norm.weight = #stream.parameter.named<"model"::"blk.28.ffn_norm.weight"> : tensor<4096xbf16> | |
util.global private @"__auto.blk.28.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.28.ffn_gate.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.28.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.28.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.28.ffn_gate.weight:d" = #stream.parameter.named<"model"::"blk.28.ffn_gate.weight:d"> : tensor<f32> | |
util.global private @"__auto.blk.28.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.28.ffn_up.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.28.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.28.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.28.ffn_up.weight:d" = #stream.parameter.named<"model"::"blk.28.ffn_up.weight:d"> : tensor<f32> | |
util.global private @"__auto.blk.28.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.28.ffn_down.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.28.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.28.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ> | |
util.global private @"__auto.blk.28.ffn_down.weight:d" = #stream.parameter.named<"model"::"blk.28.ffn_down.weight:d"> : tensor<f32> | |
util.global private @__auto.blk.29.attn_norm.weight = #stream.parameter.named<"model"::"blk.29.attn_norm.weight"> : tensor<4096xbf16> | |
util.global private @"__auto.blk.29.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.29.attn_q.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.29.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.29.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.29.attn_q.q_output:rscale" = #stream.parameter.named<"model"::"blk.29.attn_q.q_output:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.29.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.29.attn_k.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.29.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.29.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.29.attn_k.q_output:rscale" = #stream.parameter.named<"model"::"blk.29.attn_k.q_output:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.29.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.29.attn_v.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.29.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.29.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.29.attn_v.q_output:rscale" = #stream.parameter.named<"model"::"blk.29.attn_v.q_output:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.29.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.29.attn_output.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.29.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.29.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.29.attn_output.weight:d" = #stream.parameter.named<"model"::"blk.29.attn_output.weight:d"> : tensor<f32> | |
util.global private @__auto.blk.29.ffn_norm.weight = #stream.parameter.named<"model"::"blk.29.ffn_norm.weight"> : tensor<4096xbf16> | |
util.global private @"__auto.blk.29.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.29.ffn_gate.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.29.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.29.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.29.ffn_gate.weight:d" = #stream.parameter.named<"model"::"blk.29.ffn_gate.weight:d"> : tensor<f32> | |
util.global private @"__auto.blk.29.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.29.ffn_up.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.29.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.29.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.29.ffn_up.weight:d" = #stream.parameter.named<"model"::"blk.29.ffn_up.weight:d"> : tensor<f32> | |
util.global private @"__auto.blk.29.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.29.ffn_down.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.29.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.29.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ> | |
util.global private @"__auto.blk.29.ffn_down.weight:d" = #stream.parameter.named<"model"::"blk.29.ffn_down.weight:d"> : tensor<f32> | |
util.global private @__auto.blk.30.attn_norm.weight = #stream.parameter.named<"model"::"blk.30.attn_norm.weight"> : tensor<4096xbf16> | |
util.global private @"__auto.blk.30.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.30.attn_q.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.30.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.30.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.30.attn_q.q_output:rscale" = #stream.parameter.named<"model"::"blk.30.attn_q.q_output:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.30.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.30.attn_k.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.30.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.30.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.30.attn_k.q_output:rscale" = #stream.parameter.named<"model"::"blk.30.attn_k.q_output:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.30.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.30.attn_v.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.30.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.30.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.30.attn_v.q_output:rscale" = #stream.parameter.named<"model"::"blk.30.attn_v.q_output:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.30.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.30.attn_output.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.30.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.30.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.30.attn_output.weight:d" = #stream.parameter.named<"model"::"blk.30.attn_output.weight:d"> : tensor<f32> | |
util.global private @__auto.blk.30.ffn_norm.weight = #stream.parameter.named<"model"::"blk.30.ffn_norm.weight"> : tensor<4096xbf16> | |
util.global private @"__auto.blk.30.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.30.ffn_gate.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.30.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.30.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.30.ffn_gate.weight:d" = #stream.parameter.named<"model"::"blk.30.ffn_gate.weight:d"> : tensor<f32> | |
util.global private @"__auto.blk.30.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.30.ffn_up.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.30.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.30.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.30.ffn_up.weight:d" = #stream.parameter.named<"model"::"blk.30.ffn_up.weight:d"> : tensor<f32> | |
util.global private @"__auto.blk.30.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.30.ffn_down.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.30.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.30.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ> | |
util.global private @"__auto.blk.30.ffn_down.weight:d" = #stream.parameter.named<"model"::"blk.30.ffn_down.weight:d"> : tensor<f32> | |
util.global private @__auto.blk.31.attn_norm.weight = #stream.parameter.named<"model"::"blk.31.attn_norm.weight"> : tensor<4096xbf16> | |
util.global private @"__auto.blk.31.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.31.attn_q.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.31.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.31.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.31.attn_q.q_output:rscale" = #stream.parameter.named<"model"::"blk.31.attn_q.q_output:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.31.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.31.attn_k.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.31.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.31.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.31.attn_k.q_output:rscale" = #stream.parameter.named<"model"::"blk.31.attn_k.q_output:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.31.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.31.attn_v.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.31.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.31.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.31.attn_v.q_output:rscale" = #stream.parameter.named<"model"::"blk.31.attn_v.q_output:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.31.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.31.attn_output.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.31.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.31.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.31.attn_output.weight:d" = #stream.parameter.named<"model"::"blk.31.attn_output.weight:d"> : tensor<f32> | |
util.global private @__auto.blk.31.ffn_norm.weight = #stream.parameter.named<"model"::"blk.31.ffn_norm.weight"> : tensor<4096xbf16> | |
util.global private @"__auto.blk.31.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.31.ffn_gate.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.31.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.31.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.31.ffn_gate.weight:d" = #stream.parameter.named<"model"::"blk.31.ffn_gate.weight:d"> : tensor<f32> | |
util.global private @"__auto.blk.31.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.31.ffn_up.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.31.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.31.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.31.ffn_up.weight:d" = #stream.parameter.named<"model"::"blk.31.ffn_up.weight:d"> : tensor<f32> | |
util.global private @"__auto.blk.31.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.31.ffn_down.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.31.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.31.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ> | |
util.global private @"__auto.blk.31.ffn_down.weight:d" = #stream.parameter.named<"model"::"blk.31.ffn_down.weight:d"> : tensor<f32> | |
util.global private @__auto.output_norm.weight = #stream.parameter.named<"model"::"output_norm.weight"> : tensor<4096xbf16> | |
util.global private @__auto.output.weight = #stream.parameter.named<"model"::"output.weight"> : tensor<128256x4096xbf16> | |
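// Prefill entrypoint for batch size 4 with a dynamic sequence length. Going by the signature, %arg0 is
// presumably the token ids, %arg1 the per-sequence lengths, %arg2 a page/block table for the paged cache,
// and %arg3 the f8E4M3FNUZ KV-cache buffer; the [4,?,128256] f32 result holds per-token vocabulary logits.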
func.func @prefill_bs4(%arg0: !torch.vtensor<[4,?],si64>, %arg1: !torch.vtensor<[4],si64>, %arg2: !torch.vtensor<[4,?],si64>, %arg3: !torch.tensor<[?,2097152],f8E4M3FNUZ>) -> !torch.vtensor<[4,?,128256],f32> attributes {torch.assume_strict_symbolic_shapes} { | |
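    // The body first materializes every parameter: each util.global.load yields a builtin tensor that
    // torch_c.from_builtin_tensor wraps into a !torch.vtensor for the torch-dialect ops that follow.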
%__auto.token_embd.weight = util.global.load @__auto.token_embd.weight : tensor<128256x4096xbf16> | |
%0 = torch_c.from_builtin_tensor %__auto.token_embd.weight : tensor<128256x4096xbf16> -> !torch.vtensor<[128256,4096],bf16> | |
%__auto.blk.0.attn_norm.weight = util.global.load @__auto.blk.0.attn_norm.weight : tensor<4096xbf16> | |
%1 = torch_c.from_builtin_tensor %__auto.blk.0.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.0.attn_q.q_input3Arscale = util.global.load @"__auto.blk.0.attn_q.q_input:rscale" : tensor<f32> | |
%2 = torch_c.from_builtin_tensor %__auto.blk.0.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.0.attn_q.weight3Aqs = util.global.load @"__auto.blk.0.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%3 = torch_c.from_builtin_tensor %__auto.blk.0.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.0.attn_q.q_output3Arscale = util.global.load @"__auto.blk.0.attn_q.q_output:rscale" : tensor<f32> | |
%4 = torch_c.from_builtin_tensor %__auto.blk.0.attn_q.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.0.attn_k.q_input3Arscale = util.global.load @"__auto.blk.0.attn_k.q_input:rscale" : tensor<f32> | |
%5 = torch_c.from_builtin_tensor %__auto.blk.0.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.0.attn_k.weight3Aqs = util.global.load @"__auto.blk.0.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%6 = torch_c.from_builtin_tensor %__auto.blk.0.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.0.attn_k.q_output3Arscale = util.global.load @"__auto.blk.0.attn_k.q_output:rscale" : tensor<f32> | |
%7 = torch_c.from_builtin_tensor %__auto.blk.0.attn_k.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.0.attn_v.q_input3Arscale = util.global.load @"__auto.blk.0.attn_v.q_input:rscale" : tensor<f32> | |
%8 = torch_c.from_builtin_tensor %__auto.blk.0.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.0.attn_v.weight3Aqs = util.global.load @"__auto.blk.0.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%9 = torch_c.from_builtin_tensor %__auto.blk.0.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.0.attn_v.q_output3Arscale = util.global.load @"__auto.blk.0.attn_v.q_output:rscale" : tensor<f32> | |
%10 = torch_c.from_builtin_tensor %__auto.blk.0.attn_v.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
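    // The scalar literal below, 0.0883883461, equals 1/sqrt(128), presumably the softmax scaling factor
    // for 128-wide attention heads; the same constant is re-materialized per block (e.g. %35 further down).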
%11 = torch.vtensor.literal(dense<0.0883883461> : tensor<f32>) : !torch.vtensor<[],f32> | |
%__auto.blk.0.attn_output.q_input3Arscale = util.global.load @"__auto.blk.0.attn_output.q_input:rscale" : tensor<f32> | |
%12 = torch_c.from_builtin_tensor %__auto.blk.0.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.0.attn_output.weight3Aqs = util.global.load @"__auto.blk.0.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%13 = torch_c.from_builtin_tensor %__auto.blk.0.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.0.attn_output.weight3Ad = util.global.load @"__auto.blk.0.attn_output.weight:d" : tensor<f32> | |
%14 = torch_c.from_builtin_tensor %__auto.blk.0.attn_output.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.0.ffn_norm.weight = util.global.load @__auto.blk.0.ffn_norm.weight : tensor<4096xbf16> | |
%15 = torch_c.from_builtin_tensor %__auto.blk.0.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.0.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.0.ffn_gate.q_input:rscale" : tensor<f32> | |
%16 = torch_c.from_builtin_tensor %__auto.blk.0.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.0.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.0.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%17 = torch_c.from_builtin_tensor %__auto.blk.0.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.0.ffn_gate.weight3Ad = util.global.load @"__auto.blk.0.ffn_gate.weight:d" : tensor<f32> | |
%18 = torch_c.from_builtin_tensor %__auto.blk.0.ffn_gate.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.0.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.0.ffn_up.q_input:rscale" : tensor<f32> | |
%19 = torch_c.from_builtin_tensor %__auto.blk.0.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.0.ffn_up.weight3Aqs = util.global.load @"__auto.blk.0.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%20 = torch_c.from_builtin_tensor %__auto.blk.0.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.0.ffn_up.weight3Ad = util.global.load @"__auto.blk.0.ffn_up.weight:d" : tensor<f32> | |
%21 = torch_c.from_builtin_tensor %__auto.blk.0.ffn_up.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.0.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.0.ffn_down.q_input:rscale" : tensor<f32> | |
%22 = torch_c.from_builtin_tensor %__auto.blk.0.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.0.ffn_down.weight3Aqs = util.global.load @"__auto.blk.0.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ> | |
%23 = torch_c.from_builtin_tensor %__auto.blk.0.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ> | |
%__auto.blk.0.ffn_down.weight3Ad = util.global.load @"__auto.blk.0.ffn_down.weight:d" : tensor<f32> | |
%24 = torch_c.from_builtin_tensor %__auto.blk.0.ffn_down.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
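// blk.1 parameter loads: the same pattern repeats for every transformer block below.
// Each block loads attn_norm (bf16), the q/k/v/output projection weights quantized to f8E4M3FNUZ
// with f32 input/output rescale factors, a dense<0.0883883461> f32 literal (1/sqrt(128), presumably
// the attention scale), ffn_norm (bf16), and the gate/up/down FFN weights with their f32 scales.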
%__auto.blk.1.attn_norm.weight = util.global.load @__auto.blk.1.attn_norm.weight : tensor<4096xbf16> | |
%25 = torch_c.from_builtin_tensor %__auto.blk.1.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.1.attn_q.q_input3Arscale = util.global.load @"__auto.blk.1.attn_q.q_input:rscale" : tensor<f32> | |
%26 = torch_c.from_builtin_tensor %__auto.blk.1.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.1.attn_q.weight3Aqs = util.global.load @"__auto.blk.1.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%27 = torch_c.from_builtin_tensor %__auto.blk.1.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.1.attn_q.q_output3Arscale = util.global.load @"__auto.blk.1.attn_q.q_output:rscale" : tensor<f32> | |
%28 = torch_c.from_builtin_tensor %__auto.blk.1.attn_q.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.1.attn_k.q_input3Arscale = util.global.load @"__auto.blk.1.attn_k.q_input:rscale" : tensor<f32> | |
%29 = torch_c.from_builtin_tensor %__auto.blk.1.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.1.attn_k.weight3Aqs = util.global.load @"__auto.blk.1.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%30 = torch_c.from_builtin_tensor %__auto.blk.1.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.1.attn_k.q_output3Arscale = util.global.load @"__auto.blk.1.attn_k.q_output:rscale" : tensor<f32> | |
%31 = torch_c.from_builtin_tensor %__auto.blk.1.attn_k.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.1.attn_v.q_input3Arscale = util.global.load @"__auto.blk.1.attn_v.q_input:rscale" : tensor<f32> | |
%32 = torch_c.from_builtin_tensor %__auto.blk.1.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.1.attn_v.weight3Aqs = util.global.load @"__auto.blk.1.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%33 = torch_c.from_builtin_tensor %__auto.blk.1.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.1.attn_v.q_output3Arscale = util.global.load @"__auto.blk.1.attn_v.q_output:rscale" : tensor<f32> | |
%34 = torch_c.from_builtin_tensor %__auto.blk.1.attn_v.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%35 = torch.vtensor.literal(dense<0.0883883461> : tensor<f32>) : !torch.vtensor<[],f32> | |
%__auto.blk.1.attn_output.q_input3Arscale = util.global.load @"__auto.blk.1.attn_output.q_input:rscale" : tensor<f32> | |
%36 = torch_c.from_builtin_tensor %__auto.blk.1.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.1.attn_output.weight3Aqs = util.global.load @"__auto.blk.1.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%37 = torch_c.from_builtin_tensor %__auto.blk.1.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.1.attn_output.weight3Ad = util.global.load @"__auto.blk.1.attn_output.weight:d" : tensor<f32> | |
%38 = torch_c.from_builtin_tensor %__auto.blk.1.attn_output.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.1.ffn_norm.weight = util.global.load @__auto.blk.1.ffn_norm.weight : tensor<4096xbf16> | |
%39 = torch_c.from_builtin_tensor %__auto.blk.1.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.1.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.1.ffn_gate.q_input:rscale" : tensor<f32> | |
%40 = torch_c.from_builtin_tensor %__auto.blk.1.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.1.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.1.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%41 = torch_c.from_builtin_tensor %__auto.blk.1.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.1.ffn_gate.weight3Ad = util.global.load @"__auto.blk.1.ffn_gate.weight:d" : tensor<f32> | |
%42 = torch_c.from_builtin_tensor %__auto.blk.1.ffn_gate.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.1.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.1.ffn_up.q_input:rscale" : tensor<f32> | |
%43 = torch_c.from_builtin_tensor %__auto.blk.1.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.1.ffn_up.weight3Aqs = util.global.load @"__auto.blk.1.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%44 = torch_c.from_builtin_tensor %__auto.blk.1.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.1.ffn_up.weight3Ad = util.global.load @"__auto.blk.1.ffn_up.weight:d" : tensor<f32> | |
%45 = torch_c.from_builtin_tensor %__auto.blk.1.ffn_up.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.1.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.1.ffn_down.q_input:rscale" : tensor<f32> | |
%46 = torch_c.from_builtin_tensor %__auto.blk.1.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.1.ffn_down.weight3Aqs = util.global.load @"__auto.blk.1.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ> | |
%47 = torch_c.from_builtin_tensor %__auto.blk.1.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ> | |
%__auto.blk.1.ffn_down.weight3Ad = util.global.load @"__auto.blk.1.ffn_down.weight:d" : tensor<f32> | |
%48 = torch_c.from_builtin_tensor %__auto.blk.1.ffn_down.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
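// blk.2 parameter loads (same pattern as blk.1).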
%__auto.blk.2.attn_norm.weight = util.global.load @__auto.blk.2.attn_norm.weight : tensor<4096xbf16> | |
%49 = torch_c.from_builtin_tensor %__auto.blk.2.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.2.attn_q.q_input3Arscale = util.global.load @"__auto.blk.2.attn_q.q_input:rscale" : tensor<f32> | |
%50 = torch_c.from_builtin_tensor %__auto.blk.2.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.2.attn_q.weight3Aqs = util.global.load @"__auto.blk.2.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%51 = torch_c.from_builtin_tensor %__auto.blk.2.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.2.attn_q.q_output3Arscale = util.global.load @"__auto.blk.2.attn_q.q_output:rscale" : tensor<f32> | |
%52 = torch_c.from_builtin_tensor %__auto.blk.2.attn_q.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.2.attn_k.q_input3Arscale = util.global.load @"__auto.blk.2.attn_k.q_input:rscale" : tensor<f32> | |
%53 = torch_c.from_builtin_tensor %__auto.blk.2.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.2.attn_k.weight3Aqs = util.global.load @"__auto.blk.2.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%54 = torch_c.from_builtin_tensor %__auto.blk.2.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.2.attn_k.q_output3Arscale = util.global.load @"__auto.blk.2.attn_k.q_output:rscale" : tensor<f32> | |
%55 = torch_c.from_builtin_tensor %__auto.blk.2.attn_k.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.2.attn_v.q_input3Arscale = util.global.load @"__auto.blk.2.attn_v.q_input:rscale" : tensor<f32> | |
%56 = torch_c.from_builtin_tensor %__auto.blk.2.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.2.attn_v.weight3Aqs = util.global.load @"__auto.blk.2.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%57 = torch_c.from_builtin_tensor %__auto.blk.2.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.2.attn_v.q_output3Arscale = util.global.load @"__auto.blk.2.attn_v.q_output:rscale" : tensor<f32> | |
%58 = torch_c.from_builtin_tensor %__auto.blk.2.attn_v.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%59 = torch.vtensor.literal(dense<0.0883883461> : tensor<f32>) : !torch.vtensor<[],f32> | |
%__auto.blk.2.attn_output.q_input3Arscale = util.global.load @"__auto.blk.2.attn_output.q_input:rscale" : tensor<f32> | |
%60 = torch_c.from_builtin_tensor %__auto.blk.2.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.2.attn_output.weight3Aqs = util.global.load @"__auto.blk.2.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%61 = torch_c.from_builtin_tensor %__auto.blk.2.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.2.attn_output.weight3Ad = util.global.load @"__auto.blk.2.attn_output.weight:d" : tensor<f32> | |
%62 = torch_c.from_builtin_tensor %__auto.blk.2.attn_output.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.2.ffn_norm.weight = util.global.load @__auto.blk.2.ffn_norm.weight : tensor<4096xbf16> | |
%63 = torch_c.from_builtin_tensor %__auto.blk.2.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.2.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.2.ffn_gate.q_input:rscale" : tensor<f32> | |
%64 = torch_c.from_builtin_tensor %__auto.blk.2.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.2.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.2.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%65 = torch_c.from_builtin_tensor %__auto.blk.2.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.2.ffn_gate.weight3Ad = util.global.load @"__auto.blk.2.ffn_gate.weight:d" : tensor<f32> | |
%66 = torch_c.from_builtin_tensor %__auto.blk.2.ffn_gate.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.2.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.2.ffn_up.q_input:rscale" : tensor<f32> | |
%67 = torch_c.from_builtin_tensor %__auto.blk.2.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.2.ffn_up.weight3Aqs = util.global.load @"__auto.blk.2.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%68 = torch_c.from_builtin_tensor %__auto.blk.2.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.2.ffn_up.weight3Ad = util.global.load @"__auto.blk.2.ffn_up.weight:d" : tensor<f32> | |
%69 = torch_c.from_builtin_tensor %__auto.blk.2.ffn_up.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.2.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.2.ffn_down.q_input:rscale" : tensor<f32> | |
%70 = torch_c.from_builtin_tensor %__auto.blk.2.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.2.ffn_down.weight3Aqs = util.global.load @"__auto.blk.2.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ> | |
%71 = torch_c.from_builtin_tensor %__auto.blk.2.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ> | |
%__auto.blk.2.ffn_down.weight3Ad = util.global.load @"__auto.blk.2.ffn_down.weight:d" : tensor<f32> | |
%72 = torch_c.from_builtin_tensor %__auto.blk.2.ffn_down.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
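// blk.3 parameter loads (same pattern as blk.1).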
%__auto.blk.3.attn_norm.weight = util.global.load @__auto.blk.3.attn_norm.weight : tensor<4096xbf16> | |
%73 = torch_c.from_builtin_tensor %__auto.blk.3.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.3.attn_q.q_input3Arscale = util.global.load @"__auto.blk.3.attn_q.q_input:rscale" : tensor<f32> | |
%74 = torch_c.from_builtin_tensor %__auto.blk.3.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.3.attn_q.weight3Aqs = util.global.load @"__auto.blk.3.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%75 = torch_c.from_builtin_tensor %__auto.blk.3.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.3.attn_q.q_output3Arscale = util.global.load @"__auto.blk.3.attn_q.q_output:rscale" : tensor<f32> | |
%76 = torch_c.from_builtin_tensor %__auto.blk.3.attn_q.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.3.attn_k.q_input3Arscale = util.global.load @"__auto.blk.3.attn_k.q_input:rscale" : tensor<f32> | |
%77 = torch_c.from_builtin_tensor %__auto.blk.3.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.3.attn_k.weight3Aqs = util.global.load @"__auto.blk.3.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%78 = torch_c.from_builtin_tensor %__auto.blk.3.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.3.attn_k.q_output3Arscale = util.global.load @"__auto.blk.3.attn_k.q_output:rscale" : tensor<f32> | |
%79 = torch_c.from_builtin_tensor %__auto.blk.3.attn_k.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.3.attn_v.q_input3Arscale = util.global.load @"__auto.blk.3.attn_v.q_input:rscale" : tensor<f32> | |
%80 = torch_c.from_builtin_tensor %__auto.blk.3.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.3.attn_v.weight3Aqs = util.global.load @"__auto.blk.3.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%81 = torch_c.from_builtin_tensor %__auto.blk.3.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.3.attn_v.q_output3Arscale = util.global.load @"__auto.blk.3.attn_v.q_output:rscale" : tensor<f32> | |
%82 = torch_c.from_builtin_tensor %__auto.blk.3.attn_v.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%83 = torch.vtensor.literal(dense<0.0883883461> : tensor<f32>) : !torch.vtensor<[],f32> | |
%__auto.blk.3.attn_output.q_input3Arscale = util.global.load @"__auto.blk.3.attn_output.q_input:rscale" : tensor<f32> | |
%84 = torch_c.from_builtin_tensor %__auto.blk.3.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.3.attn_output.weight3Aqs = util.global.load @"__auto.blk.3.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%85 = torch_c.from_builtin_tensor %__auto.blk.3.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.3.attn_output.weight3Ad = util.global.load @"__auto.blk.3.attn_output.weight:d" : tensor<f32> | |
%86 = torch_c.from_builtin_tensor %__auto.blk.3.attn_output.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.3.ffn_norm.weight = util.global.load @__auto.blk.3.ffn_norm.weight : tensor<4096xbf16> | |
%87 = torch_c.from_builtin_tensor %__auto.blk.3.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.3.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.3.ffn_gate.q_input:rscale" : tensor<f32> | |
%88 = torch_c.from_builtin_tensor %__auto.blk.3.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.3.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.3.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%89 = torch_c.from_builtin_tensor %__auto.blk.3.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.3.ffn_gate.weight3Ad = util.global.load @"__auto.blk.3.ffn_gate.weight:d" : tensor<f32> | |
%90 = torch_c.from_builtin_tensor %__auto.blk.3.ffn_gate.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.3.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.3.ffn_up.q_input:rscale" : tensor<f32> | |
%91 = torch_c.from_builtin_tensor %__auto.blk.3.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.3.ffn_up.weight3Aqs = util.global.load @"__auto.blk.3.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%92 = torch_c.from_builtin_tensor %__auto.blk.3.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.3.ffn_up.weight3Ad = util.global.load @"__auto.blk.3.ffn_up.weight:d" : tensor<f32> | |
%93 = torch_c.from_builtin_tensor %__auto.blk.3.ffn_up.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.3.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.3.ffn_down.q_input:rscale" : tensor<f32> | |
%94 = torch_c.from_builtin_tensor %__auto.blk.3.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.3.ffn_down.weight3Aqs = util.global.load @"__auto.blk.3.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ> | |
%95 = torch_c.from_builtin_tensor %__auto.blk.3.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ> | |
%__auto.blk.3.ffn_down.weight3Ad = util.global.load @"__auto.blk.3.ffn_down.weight:d" : tensor<f32> | |
%96 = torch_c.from_builtin_tensor %__auto.blk.3.ffn_down.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
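// blk.4 parameter loads (same pattern as blk.1).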
%__auto.blk.4.attn_norm.weight = util.global.load @__auto.blk.4.attn_norm.weight : tensor<4096xbf16> | |
%97 = torch_c.from_builtin_tensor %__auto.blk.4.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.4.attn_q.q_input3Arscale = util.global.load @"__auto.blk.4.attn_q.q_input:rscale" : tensor<f32> | |
%98 = torch_c.from_builtin_tensor %__auto.blk.4.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.4.attn_q.weight3Aqs = util.global.load @"__auto.blk.4.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%99 = torch_c.from_builtin_tensor %__auto.blk.4.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.4.attn_q.q_output3Arscale = util.global.load @"__auto.blk.4.attn_q.q_output:rscale" : tensor<f32> | |
%100 = torch_c.from_builtin_tensor %__auto.blk.4.attn_q.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.4.attn_k.q_input3Arscale = util.global.load @"__auto.blk.4.attn_k.q_input:rscale" : tensor<f32> | |
%101 = torch_c.from_builtin_tensor %__auto.blk.4.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.4.attn_k.weight3Aqs = util.global.load @"__auto.blk.4.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%102 = torch_c.from_builtin_tensor %__auto.blk.4.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.4.attn_k.q_output3Arscale = util.global.load @"__auto.blk.4.attn_k.q_output:rscale" : tensor<f32> | |
%103 = torch_c.from_builtin_tensor %__auto.blk.4.attn_k.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.4.attn_v.q_input3Arscale = util.global.load @"__auto.blk.4.attn_v.q_input:rscale" : tensor<f32> | |
%104 = torch_c.from_builtin_tensor %__auto.blk.4.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.4.attn_v.weight3Aqs = util.global.load @"__auto.blk.4.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%105 = torch_c.from_builtin_tensor %__auto.blk.4.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.4.attn_v.q_output3Arscale = util.global.load @"__auto.blk.4.attn_v.q_output:rscale" : tensor<f32> | |
%106 = torch_c.from_builtin_tensor %__auto.blk.4.attn_v.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%107 = torch.vtensor.literal(dense<0.0883883461> : tensor<f32>) : !torch.vtensor<[],f32> | |
%__auto.blk.4.attn_output.q_input3Arscale = util.global.load @"__auto.blk.4.attn_output.q_input:rscale" : tensor<f32> | |
%108 = torch_c.from_builtin_tensor %__auto.blk.4.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.4.attn_output.weight3Aqs = util.global.load @"__auto.blk.4.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%109 = torch_c.from_builtin_tensor %__auto.blk.4.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.4.attn_output.weight3Ad = util.global.load @"__auto.blk.4.attn_output.weight:d" : tensor<f32> | |
%110 = torch_c.from_builtin_tensor %__auto.blk.4.attn_output.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.4.ffn_norm.weight = util.global.load @__auto.blk.4.ffn_norm.weight : tensor<4096xbf16> | |
%111 = torch_c.from_builtin_tensor %__auto.blk.4.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.4.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.4.ffn_gate.q_input:rscale" : tensor<f32> | |
%112 = torch_c.from_builtin_tensor %__auto.blk.4.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.4.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.4.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%113 = torch_c.from_builtin_tensor %__auto.blk.4.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.4.ffn_gate.weight3Ad = util.global.load @"__auto.blk.4.ffn_gate.weight:d" : tensor<f32> | |
%114 = torch_c.from_builtin_tensor %__auto.blk.4.ffn_gate.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.4.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.4.ffn_up.q_input:rscale" : tensor<f32> | |
%115 = torch_c.from_builtin_tensor %__auto.blk.4.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.4.ffn_up.weight3Aqs = util.global.load @"__auto.blk.4.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%116 = torch_c.from_builtin_tensor %__auto.blk.4.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.4.ffn_up.weight3Ad = util.global.load @"__auto.blk.4.ffn_up.weight:d" : tensor<f32> | |
%117 = torch_c.from_builtin_tensor %__auto.blk.4.ffn_up.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.4.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.4.ffn_down.q_input:rscale" : tensor<f32> | |
%118 = torch_c.from_builtin_tensor %__auto.blk.4.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.4.ffn_down.weight3Aqs = util.global.load @"__auto.blk.4.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ> | |
%119 = torch_c.from_builtin_tensor %__auto.blk.4.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ> | |
%__auto.blk.4.ffn_down.weight3Ad = util.global.load @"__auto.blk.4.ffn_down.weight:d" : tensor<f32> | |
%120 = torch_c.from_builtin_tensor %__auto.blk.4.ffn_down.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
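// blk.5 parameter loads (same pattern as blk.1).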
%__auto.blk.5.attn_norm.weight = util.global.load @__auto.blk.5.attn_norm.weight : tensor<4096xbf16> | |
%121 = torch_c.from_builtin_tensor %__auto.blk.5.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.5.attn_q.q_input3Arscale = util.global.load @"__auto.blk.5.attn_q.q_input:rscale" : tensor<f32> | |
%122 = torch_c.from_builtin_tensor %__auto.blk.5.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.5.attn_q.weight3Aqs = util.global.load @"__auto.blk.5.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%123 = torch_c.from_builtin_tensor %__auto.blk.5.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.5.attn_q.q_output3Arscale = util.global.load @"__auto.blk.5.attn_q.q_output:rscale" : tensor<f32> | |
%124 = torch_c.from_builtin_tensor %__auto.blk.5.attn_q.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.5.attn_k.q_input3Arscale = util.global.load @"__auto.blk.5.attn_k.q_input:rscale" : tensor<f32> | |
%125 = torch_c.from_builtin_tensor %__auto.blk.5.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.5.attn_k.weight3Aqs = util.global.load @"__auto.blk.5.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%126 = torch_c.from_builtin_tensor %__auto.blk.5.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.5.attn_k.q_output3Arscale = util.global.load @"__auto.blk.5.attn_k.q_output:rscale" : tensor<f32> | |
%127 = torch_c.from_builtin_tensor %__auto.blk.5.attn_k.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.5.attn_v.q_input3Arscale = util.global.load @"__auto.blk.5.attn_v.q_input:rscale" : tensor<f32> | |
%128 = torch_c.from_builtin_tensor %__auto.blk.5.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.5.attn_v.weight3Aqs = util.global.load @"__auto.blk.5.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%129 = torch_c.from_builtin_tensor %__auto.blk.5.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.5.attn_v.q_output3Arscale = util.global.load @"__auto.blk.5.attn_v.q_output:rscale" : tensor<f32> | |
%130 = torch_c.from_builtin_tensor %__auto.blk.5.attn_v.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%131 = torch.vtensor.literal(dense<0.0883883461> : tensor<f32>) : !torch.vtensor<[],f32> | |
%__auto.blk.5.attn_output.q_input3Arscale = util.global.load @"__auto.blk.5.attn_output.q_input:rscale" : tensor<f32> | |
%132 = torch_c.from_builtin_tensor %__auto.blk.5.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.5.attn_output.weight3Aqs = util.global.load @"__auto.blk.5.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%133 = torch_c.from_builtin_tensor %__auto.blk.5.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.5.attn_output.weight3Ad = util.global.load @"__auto.blk.5.attn_output.weight:d" : tensor<f32> | |
%134 = torch_c.from_builtin_tensor %__auto.blk.5.attn_output.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.5.ffn_norm.weight = util.global.load @__auto.blk.5.ffn_norm.weight : tensor<4096xbf16> | |
%135 = torch_c.from_builtin_tensor %__auto.blk.5.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.5.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.5.ffn_gate.q_input:rscale" : tensor<f32> | |
%136 = torch_c.from_builtin_tensor %__auto.blk.5.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.5.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.5.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%137 = torch_c.from_builtin_tensor %__auto.blk.5.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.5.ffn_gate.weight3Ad = util.global.load @"__auto.blk.5.ffn_gate.weight:d" : tensor<f32> | |
%138 = torch_c.from_builtin_tensor %__auto.blk.5.ffn_gate.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.5.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.5.ffn_up.q_input:rscale" : tensor<f32> | |
%139 = torch_c.from_builtin_tensor %__auto.blk.5.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.5.ffn_up.weight3Aqs = util.global.load @"__auto.blk.5.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%140 = torch_c.from_builtin_tensor %__auto.blk.5.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.5.ffn_up.weight3Ad = util.global.load @"__auto.blk.5.ffn_up.weight:d" : tensor<f32> | |
%141 = torch_c.from_builtin_tensor %__auto.blk.5.ffn_up.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.5.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.5.ffn_down.q_input:rscale" : tensor<f32> | |
%142 = torch_c.from_builtin_tensor %__auto.blk.5.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.5.ffn_down.weight3Aqs = util.global.load @"__auto.blk.5.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ> | |
%143 = torch_c.from_builtin_tensor %__auto.blk.5.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ> | |
%__auto.blk.5.ffn_down.weight3Ad = util.global.load @"__auto.blk.5.ffn_down.weight:d" : tensor<f32> | |
%144 = torch_c.from_builtin_tensor %__auto.blk.5.ffn_down.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
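// blk.6 parameter loads (same pattern as blk.1).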
%__auto.blk.6.attn_norm.weight = util.global.load @__auto.blk.6.attn_norm.weight : tensor<4096xbf16> | |
%145 = torch_c.from_builtin_tensor %__auto.blk.6.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.6.attn_q.q_input3Arscale = util.global.load @"__auto.blk.6.attn_q.q_input:rscale" : tensor<f32> | |
%146 = torch_c.from_builtin_tensor %__auto.blk.6.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.6.attn_q.weight3Aqs = util.global.load @"__auto.blk.6.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%147 = torch_c.from_builtin_tensor %__auto.blk.6.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.6.attn_q.q_output3Arscale = util.global.load @"__auto.blk.6.attn_q.q_output:rscale" : tensor<f32> | |
%148 = torch_c.from_builtin_tensor %__auto.blk.6.attn_q.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.6.attn_k.q_input3Arscale = util.global.load @"__auto.blk.6.attn_k.q_input:rscale" : tensor<f32> | |
%149 = torch_c.from_builtin_tensor %__auto.blk.6.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.6.attn_k.weight3Aqs = util.global.load @"__auto.blk.6.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%150 = torch_c.from_builtin_tensor %__auto.blk.6.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.6.attn_k.q_output3Arscale = util.global.load @"__auto.blk.6.attn_k.q_output:rscale" : tensor<f32> | |
%151 = torch_c.from_builtin_tensor %__auto.blk.6.attn_k.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.6.attn_v.q_input3Arscale = util.global.load @"__auto.blk.6.attn_v.q_input:rscale" : tensor<f32> | |
%152 = torch_c.from_builtin_tensor %__auto.blk.6.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.6.attn_v.weight3Aqs = util.global.load @"__auto.blk.6.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%153 = torch_c.from_builtin_tensor %__auto.blk.6.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.6.attn_v.q_output3Arscale = util.global.load @"__auto.blk.6.attn_v.q_output:rscale" : tensor<f32> | |
%154 = torch_c.from_builtin_tensor %__auto.blk.6.attn_v.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%155 = torch.vtensor.literal(dense<0.0883883461> : tensor<f32>) : !torch.vtensor<[],f32> | |
%__auto.blk.6.attn_output.q_input3Arscale = util.global.load @"__auto.blk.6.attn_output.q_input:rscale" : tensor<f32> | |
%156 = torch_c.from_builtin_tensor %__auto.blk.6.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.6.attn_output.weight3Aqs = util.global.load @"__auto.blk.6.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%157 = torch_c.from_builtin_tensor %__auto.blk.6.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.6.attn_output.weight3Ad = util.global.load @"__auto.blk.6.attn_output.weight:d" : tensor<f32> | |
%158 = torch_c.from_builtin_tensor %__auto.blk.6.attn_output.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.6.ffn_norm.weight = util.global.load @__auto.blk.6.ffn_norm.weight : tensor<4096xbf16> | |
%159 = torch_c.from_builtin_tensor %__auto.blk.6.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.6.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.6.ffn_gate.q_input:rscale" : tensor<f32> | |
%160 = torch_c.from_builtin_tensor %__auto.blk.6.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.6.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.6.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%161 = torch_c.from_builtin_tensor %__auto.blk.6.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.6.ffn_gate.weight3Ad = util.global.load @"__auto.blk.6.ffn_gate.weight:d" : tensor<f32> | |
%162 = torch_c.from_builtin_tensor %__auto.blk.6.ffn_gate.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.6.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.6.ffn_up.q_input:rscale" : tensor<f32> | |
%163 = torch_c.from_builtin_tensor %__auto.blk.6.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.6.ffn_up.weight3Aqs = util.global.load @"__auto.blk.6.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%164 = torch_c.from_builtin_tensor %__auto.blk.6.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.6.ffn_up.weight3Ad = util.global.load @"__auto.blk.6.ffn_up.weight:d" : tensor<f32> | |
%165 = torch_c.from_builtin_tensor %__auto.blk.6.ffn_up.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.6.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.6.ffn_down.q_input:rscale" : tensor<f32> | |
%166 = torch_c.from_builtin_tensor %__auto.blk.6.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.6.ffn_down.weight3Aqs = util.global.load @"__auto.blk.6.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ> | |
%167 = torch_c.from_builtin_tensor %__auto.blk.6.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ> | |
%__auto.blk.6.ffn_down.weight3Ad = util.global.load @"__auto.blk.6.ffn_down.weight:d" : tensor<f32> | |
%168 = torch_c.from_builtin_tensor %__auto.blk.6.ffn_down.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
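// blk.7 parameter loads (same pattern as blk.1).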
%__auto.blk.7.attn_norm.weight = util.global.load @__auto.blk.7.attn_norm.weight : tensor<4096xbf16> | |
%169 = torch_c.from_builtin_tensor %__auto.blk.7.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.7.attn_q.q_input3Arscale = util.global.load @"__auto.blk.7.attn_q.q_input:rscale" : tensor<f32> | |
%170 = torch_c.from_builtin_tensor %__auto.blk.7.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.7.attn_q.weight3Aqs = util.global.load @"__auto.blk.7.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%171 = torch_c.from_builtin_tensor %__auto.blk.7.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.7.attn_q.q_output3Arscale = util.global.load @"__auto.blk.7.attn_q.q_output:rscale" : tensor<f32> | |
%172 = torch_c.from_builtin_tensor %__auto.blk.7.attn_q.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.7.attn_k.q_input3Arscale = util.global.load @"__auto.blk.7.attn_k.q_input:rscale" : tensor<f32> | |
%173 = torch_c.from_builtin_tensor %__auto.blk.7.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.7.attn_k.weight3Aqs = util.global.load @"__auto.blk.7.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%174 = torch_c.from_builtin_tensor %__auto.blk.7.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.7.attn_k.q_output3Arscale = util.global.load @"__auto.blk.7.attn_k.q_output:rscale" : tensor<f32> | |
%175 = torch_c.from_builtin_tensor %__auto.blk.7.attn_k.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.7.attn_v.q_input3Arscale = util.global.load @"__auto.blk.7.attn_v.q_input:rscale" : tensor<f32> | |
%176 = torch_c.from_builtin_tensor %__auto.blk.7.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.7.attn_v.weight3Aqs = util.global.load @"__auto.blk.7.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%177 = torch_c.from_builtin_tensor %__auto.blk.7.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.7.attn_v.q_output3Arscale = util.global.load @"__auto.blk.7.attn_v.q_output:rscale" : tensor<f32> | |
%178 = torch_c.from_builtin_tensor %__auto.blk.7.attn_v.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%179 = torch.vtensor.literal(dense<0.0883883461> : tensor<f32>) : !torch.vtensor<[],f32> | |
%__auto.blk.7.attn_output.q_input3Arscale = util.global.load @"__auto.blk.7.attn_output.q_input:rscale" : tensor<f32> | |
%180 = torch_c.from_builtin_tensor %__auto.blk.7.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.7.attn_output.weight3Aqs = util.global.load @"__auto.blk.7.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%181 = torch_c.from_builtin_tensor %__auto.blk.7.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.7.attn_output.weight3Ad = util.global.load @"__auto.blk.7.attn_output.weight:d" : tensor<f32> | |
%182 = torch_c.from_builtin_tensor %__auto.blk.7.attn_output.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.7.ffn_norm.weight = util.global.load @__auto.blk.7.ffn_norm.weight : tensor<4096xbf16> | |
%183 = torch_c.from_builtin_tensor %__auto.blk.7.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.7.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.7.ffn_gate.q_input:rscale" : tensor<f32> | |
%184 = torch_c.from_builtin_tensor %__auto.blk.7.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.7.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.7.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%185 = torch_c.from_builtin_tensor %__auto.blk.7.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.7.ffn_gate.weight3Ad = util.global.load @"__auto.blk.7.ffn_gate.weight:d" : tensor<f32> | |
%186 = torch_c.from_builtin_tensor %__auto.blk.7.ffn_gate.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.7.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.7.ffn_up.q_input:rscale" : tensor<f32> | |
%187 = torch_c.from_builtin_tensor %__auto.blk.7.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.7.ffn_up.weight3Aqs = util.global.load @"__auto.blk.7.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%188 = torch_c.from_builtin_tensor %__auto.blk.7.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.7.ffn_up.weight3Ad = util.global.load @"__auto.blk.7.ffn_up.weight:d" : tensor<f32> | |
%189 = torch_c.from_builtin_tensor %__auto.blk.7.ffn_up.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.7.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.7.ffn_down.q_input:rscale" : tensor<f32> | |
%190 = torch_c.from_builtin_tensor %__auto.blk.7.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.7.ffn_down.weight3Aqs = util.global.load @"__auto.blk.7.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ> | |
%191 = torch_c.from_builtin_tensor %__auto.blk.7.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ> | |
%__auto.blk.7.ffn_down.weight3Ad = util.global.load @"__auto.blk.7.ffn_down.weight:d" : tensor<f32> | |
%192 = torch_c.from_builtin_tensor %__auto.blk.7.ffn_down.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
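// blk.8 parameter loads (same pattern as blk.1).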
%__auto.blk.8.attn_norm.weight = util.global.load @__auto.blk.8.attn_norm.weight : tensor<4096xbf16> | |
%193 = torch_c.from_builtin_tensor %__auto.blk.8.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.8.attn_q.q_input3Arscale = util.global.load @"__auto.blk.8.attn_q.q_input:rscale" : tensor<f32> | |
%194 = torch_c.from_builtin_tensor %__auto.blk.8.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.8.attn_q.weight3Aqs = util.global.load @"__auto.blk.8.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%195 = torch_c.from_builtin_tensor %__auto.blk.8.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.8.attn_q.q_output3Arscale = util.global.load @"__auto.blk.8.attn_q.q_output:rscale" : tensor<f32> | |
%196 = torch_c.from_builtin_tensor %__auto.blk.8.attn_q.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.8.attn_k.q_input3Arscale = util.global.load @"__auto.blk.8.attn_k.q_input:rscale" : tensor<f32> | |
%197 = torch_c.from_builtin_tensor %__auto.blk.8.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.8.attn_k.weight3Aqs = util.global.load @"__auto.blk.8.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%198 = torch_c.from_builtin_tensor %__auto.blk.8.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.8.attn_k.q_output3Arscale = util.global.load @"__auto.blk.8.attn_k.q_output:rscale" : tensor<f32> | |
%199 = torch_c.from_builtin_tensor %__auto.blk.8.attn_k.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.8.attn_v.q_input3Arscale = util.global.load @"__auto.blk.8.attn_v.q_input:rscale" : tensor<f32> | |
%200 = torch_c.from_builtin_tensor %__auto.blk.8.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.8.attn_v.weight3Aqs = util.global.load @"__auto.blk.8.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%201 = torch_c.from_builtin_tensor %__auto.blk.8.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.8.attn_v.q_output3Arscale = util.global.load @"__auto.blk.8.attn_v.q_output:rscale" : tensor<f32> | |
%202 = torch_c.from_builtin_tensor %__auto.blk.8.attn_v.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%203 = torch.vtensor.literal(dense<0.0883883461> : tensor<f32>) : !torch.vtensor<[],f32> | |
%__auto.blk.8.attn_output.q_input3Arscale = util.global.load @"__auto.blk.8.attn_output.q_input:rscale" : tensor<f32> | |
%204 = torch_c.from_builtin_tensor %__auto.blk.8.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.8.attn_output.weight3Aqs = util.global.load @"__auto.blk.8.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%205 = torch_c.from_builtin_tensor %__auto.blk.8.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.8.attn_output.weight3Ad = util.global.load @"__auto.blk.8.attn_output.weight:d" : tensor<f32> | |
%206 = torch_c.from_builtin_tensor %__auto.blk.8.attn_output.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.8.ffn_norm.weight = util.global.load @__auto.blk.8.ffn_norm.weight : tensor<4096xbf16> | |
%207 = torch_c.from_builtin_tensor %__auto.blk.8.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.8.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.8.ffn_gate.q_input:rscale" : tensor<f32> | |
%208 = torch_c.from_builtin_tensor %__auto.blk.8.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.8.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.8.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%209 = torch_c.from_builtin_tensor %__auto.blk.8.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.8.ffn_gate.weight3Ad = util.global.load @"__auto.blk.8.ffn_gate.weight:d" : tensor<f32> | |
%210 = torch_c.from_builtin_tensor %__auto.blk.8.ffn_gate.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.8.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.8.ffn_up.q_input:rscale" : tensor<f32> | |
%211 = torch_c.from_builtin_tensor %__auto.blk.8.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.8.ffn_up.weight3Aqs = util.global.load @"__auto.blk.8.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%212 = torch_c.from_builtin_tensor %__auto.blk.8.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.8.ffn_up.weight3Ad = util.global.load @"__auto.blk.8.ffn_up.weight:d" : tensor<f32> | |
%213 = torch_c.from_builtin_tensor %__auto.blk.8.ffn_up.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.8.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.8.ffn_down.q_input:rscale" : tensor<f32> | |
%214 = torch_c.from_builtin_tensor %__auto.blk.8.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.8.ffn_down.weight3Aqs = util.global.load @"__auto.blk.8.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ> | |
%215 = torch_c.from_builtin_tensor %__auto.blk.8.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ> | |
%__auto.blk.8.ffn_down.weight3Ad = util.global.load @"__auto.blk.8.ffn_down.weight:d" : tensor<f32> | |
%216 = torch_c.from_builtin_tensor %__auto.blk.8.ffn_down.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
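// blk.9 parameter loads (same pattern as blk.1).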
%__auto.blk.9.attn_norm.weight = util.global.load @__auto.blk.9.attn_norm.weight : tensor<4096xbf16> | |
%217 = torch_c.from_builtin_tensor %__auto.blk.9.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.9.attn_q.q_input3Arscale = util.global.load @"__auto.blk.9.attn_q.q_input:rscale" : tensor<f32> | |
%218 = torch_c.from_builtin_tensor %__auto.blk.9.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.9.attn_q.weight3Aqs = util.global.load @"__auto.blk.9.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%219 = torch_c.from_builtin_tensor %__auto.blk.9.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.9.attn_q.q_output3Arscale = util.global.load @"__auto.blk.9.attn_q.q_output:rscale" : tensor<f32> | |
%220 = torch_c.from_builtin_tensor %__auto.blk.9.attn_q.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.9.attn_k.q_input3Arscale = util.global.load @"__auto.blk.9.attn_k.q_input:rscale" : tensor<f32> | |
%221 = torch_c.from_builtin_tensor %__auto.blk.9.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.9.attn_k.weight3Aqs = util.global.load @"__auto.blk.9.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%222 = torch_c.from_builtin_tensor %__auto.blk.9.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.9.attn_k.q_output3Arscale = util.global.load @"__auto.blk.9.attn_k.q_output:rscale" : tensor<f32> | |
%223 = torch_c.from_builtin_tensor %__auto.blk.9.attn_k.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.9.attn_v.q_input3Arscale = util.global.load @"__auto.blk.9.attn_v.q_input:rscale" : tensor<f32> | |
%224 = torch_c.from_builtin_tensor %__auto.blk.9.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.9.attn_v.weight3Aqs = util.global.load @"__auto.blk.9.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%225 = torch_c.from_builtin_tensor %__auto.blk.9.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.9.attn_v.q_output3Arscale = util.global.load @"__auto.blk.9.attn_v.q_output:rscale" : tensor<f32> | |
%226 = torch_c.from_builtin_tensor %__auto.blk.9.attn_v.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%227 = torch.vtensor.literal(dense<0.0883883461> : tensor<f32>) : !torch.vtensor<[],f32> | |
%__auto.blk.9.attn_output.q_input3Arscale = util.global.load @"__auto.blk.9.attn_output.q_input:rscale" : tensor<f32> | |
%228 = torch_c.from_builtin_tensor %__auto.blk.9.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.9.attn_output.weight3Aqs = util.global.load @"__auto.blk.9.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%229 = torch_c.from_builtin_tensor %__auto.blk.9.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.9.attn_output.weight3Ad = util.global.load @"__auto.blk.9.attn_output.weight:d" : tensor<f32> | |
%230 = torch_c.from_builtin_tensor %__auto.blk.9.attn_output.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.9.ffn_norm.weight = util.global.load @__auto.blk.9.ffn_norm.weight : tensor<4096xbf16> | |
%231 = torch_c.from_builtin_tensor %__auto.blk.9.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.9.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.9.ffn_gate.q_input:rscale" : tensor<f32> | |
%232 = torch_c.from_builtin_tensor %__auto.blk.9.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.9.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.9.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%233 = torch_c.from_builtin_tensor %__auto.blk.9.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.9.ffn_gate.weight3Ad = util.global.load @"__auto.blk.9.ffn_gate.weight:d" : tensor<f32> | |
%234 = torch_c.from_builtin_tensor %__auto.blk.9.ffn_gate.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.9.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.9.ffn_up.q_input:rscale" : tensor<f32> | |
%235 = torch_c.from_builtin_tensor %__auto.blk.9.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.9.ffn_up.weight3Aqs = util.global.load @"__auto.blk.9.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%236 = torch_c.from_builtin_tensor %__auto.blk.9.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.9.ffn_up.weight3Ad = util.global.load @"__auto.blk.9.ffn_up.weight:d" : tensor<f32> | |
%237 = torch_c.from_builtin_tensor %__auto.blk.9.ffn_up.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.9.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.9.ffn_down.q_input:rscale" : tensor<f32> | |
%238 = torch_c.from_builtin_tensor %__auto.blk.9.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.9.ffn_down.weight3Aqs = util.global.load @"__auto.blk.9.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ> | |
%239 = torch_c.from_builtin_tensor %__auto.blk.9.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ> | |
%__auto.blk.9.ffn_down.weight3Ad = util.global.load @"__auto.blk.9.ffn_down.weight:d" : tensor<f32> | |
%240 = torch_c.from_builtin_tensor %__auto.blk.9.ffn_down.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
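// blk.10 parameter loads (same pattern as blk.1).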
%__auto.blk.10.attn_norm.weight = util.global.load @__auto.blk.10.attn_norm.weight : tensor<4096xbf16> | |
%241 = torch_c.from_builtin_tensor %__auto.blk.10.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.10.attn_q.q_input3Arscale = util.global.load @"__auto.blk.10.attn_q.q_input:rscale" : tensor<f32> | |
%242 = torch_c.from_builtin_tensor %__auto.blk.10.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.10.attn_q.weight3Aqs = util.global.load @"__auto.blk.10.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%243 = torch_c.from_builtin_tensor %__auto.blk.10.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.10.attn_q.q_output3Arscale = util.global.load @"__auto.blk.10.attn_q.q_output:rscale" : tensor<f32> | |
%244 = torch_c.from_builtin_tensor %__auto.blk.10.attn_q.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.10.attn_k.q_input3Arscale = util.global.load @"__auto.blk.10.attn_k.q_input:rscale" : tensor<f32> | |
%245 = torch_c.from_builtin_tensor %__auto.blk.10.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.10.attn_k.weight3Aqs = util.global.load @"__auto.blk.10.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%246 = torch_c.from_builtin_tensor %__auto.blk.10.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.10.attn_k.q_output3Arscale = util.global.load @"__auto.blk.10.attn_k.q_output:rscale" : tensor<f32> | |
%247 = torch_c.from_builtin_tensor %__auto.blk.10.attn_k.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.10.attn_v.q_input3Arscale = util.global.load @"__auto.blk.10.attn_v.q_input:rscale" : tensor<f32> | |
%248 = torch_c.from_builtin_tensor %__auto.blk.10.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.10.attn_v.weight3Aqs = util.global.load @"__auto.blk.10.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%249 = torch_c.from_builtin_tensor %__auto.blk.10.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.10.attn_v.q_output3Arscale = util.global.load @"__auto.blk.10.attn_v.q_output:rscale" : tensor<f32> | |
%250 = torch_c.from_builtin_tensor %__auto.blk.10.attn_v.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%251 = torch.vtensor.literal(dense<0.0883883461> : tensor<f32>) : !torch.vtensor<[],f32> | |
%__auto.blk.10.attn_output.q_input3Arscale = util.global.load @"__auto.blk.10.attn_output.q_input:rscale" : tensor<f32> | |
%252 = torch_c.from_builtin_tensor %__auto.blk.10.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.10.attn_output.weight3Aqs = util.global.load @"__auto.blk.10.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%253 = torch_c.from_builtin_tensor %__auto.blk.10.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.10.attn_output.weight3Ad = util.global.load @"__auto.blk.10.attn_output.weight:d" : tensor<f32> | |
%254 = torch_c.from_builtin_tensor %__auto.blk.10.attn_output.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.10.ffn_norm.weight = util.global.load @__auto.blk.10.ffn_norm.weight : tensor<4096xbf16> | |
%255 = torch_c.from_builtin_tensor %__auto.blk.10.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.10.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.10.ffn_gate.q_input:rscale" : tensor<f32> | |
%256 = torch_c.from_builtin_tensor %__auto.blk.10.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.10.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.10.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%257 = torch_c.from_builtin_tensor %__auto.blk.10.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.10.ffn_gate.weight3Ad = util.global.load @"__auto.blk.10.ffn_gate.weight:d" : tensor<f32> | |
%258 = torch_c.from_builtin_tensor %__auto.blk.10.ffn_gate.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.10.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.10.ffn_up.q_input:rscale" : tensor<f32> | |
%259 = torch_c.from_builtin_tensor %__auto.blk.10.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.10.ffn_up.weight3Aqs = util.global.load @"__auto.blk.10.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%260 = torch_c.from_builtin_tensor %__auto.blk.10.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.10.ffn_up.weight3Ad = util.global.load @"__auto.blk.10.ffn_up.weight:d" : tensor<f32> | |
%261 = torch_c.from_builtin_tensor %__auto.blk.10.ffn_up.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.10.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.10.ffn_down.q_input:rscale" : tensor<f32> | |
%262 = torch_c.from_builtin_tensor %__auto.blk.10.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.10.ffn_down.weight3Aqs = util.global.load @"__auto.blk.10.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ> | |
%263 = torch_c.from_builtin_tensor %__auto.blk.10.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ> | |
%__auto.blk.10.ffn_down.weight3Ad = util.global.load @"__auto.blk.10.ffn_down.weight:d" : tensor<f32> | |
%264 = torch_c.from_builtin_tensor %__auto.blk.10.ffn_down.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
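// blk.11 parameter loads (same pattern as blk.1).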
%__auto.blk.11.attn_norm.weight = util.global.load @__auto.blk.11.attn_norm.weight : tensor<4096xbf16> | |
%265 = torch_c.from_builtin_tensor %__auto.blk.11.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.11.attn_q.q_input3Arscale = util.global.load @"__auto.blk.11.attn_q.q_input:rscale" : tensor<f32> | |
%266 = torch_c.from_builtin_tensor %__auto.blk.11.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.11.attn_q.weight3Aqs = util.global.load @"__auto.blk.11.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%267 = torch_c.from_builtin_tensor %__auto.blk.11.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.11.attn_q.q_output3Arscale = util.global.load @"__auto.blk.11.attn_q.q_output:rscale" : tensor<f32> | |
%268 = torch_c.from_builtin_tensor %__auto.blk.11.attn_q.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.11.attn_k.q_input3Arscale = util.global.load @"__auto.blk.11.attn_k.q_input:rscale" : tensor<f32> | |
%269 = torch_c.from_builtin_tensor %__auto.blk.11.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.11.attn_k.weight3Aqs = util.global.load @"__auto.blk.11.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%270 = torch_c.from_builtin_tensor %__auto.blk.11.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.11.attn_k.q_output3Arscale = util.global.load @"__auto.blk.11.attn_k.q_output:rscale" : tensor<f32> | |
%271 = torch_c.from_builtin_tensor %__auto.blk.11.attn_k.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.11.attn_v.q_input3Arscale = util.global.load @"__auto.blk.11.attn_v.q_input:rscale" : tensor<f32> | |
%272 = torch_c.from_builtin_tensor %__auto.blk.11.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.11.attn_v.weight3Aqs = util.global.load @"__auto.blk.11.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%273 = torch_c.from_builtin_tensor %__auto.blk.11.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.11.attn_v.q_output3Arscale = util.global.load @"__auto.blk.11.attn_v.q_output:rscale" : tensor<f32> | |
%274 = torch_c.from_builtin_tensor %__auto.blk.11.attn_v.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
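// The f32 literal below (0.0883883461, roughly 1/sqrt(128)) appears to be the attention softmax
// scaling constant for head_dim = 128; the same literal is materialized once per block.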
%275 = torch.vtensor.literal(dense<0.0883883461> : tensor<f32>) : !torch.vtensor<[],f32> | |
%__auto.blk.11.attn_output.q_input3Arscale = util.global.load @"__auto.blk.11.attn_output.q_input:rscale" : tensor<f32> | |
%276 = torch_c.from_builtin_tensor %__auto.blk.11.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.11.attn_output.weight3Aqs = util.global.load @"__auto.blk.11.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%277 = torch_c.from_builtin_tensor %__auto.blk.11.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.11.attn_output.weight3Ad = util.global.load @"__auto.blk.11.attn_output.weight:d" : tensor<f32> | |
%278 = torch_c.from_builtin_tensor %__auto.blk.11.attn_output.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.11.ffn_norm.weight = util.global.load @__auto.blk.11.ffn_norm.weight : tensor<4096xbf16> | |
%279 = torch_c.from_builtin_tensor %__auto.blk.11.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.11.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.11.ffn_gate.q_input:rscale" : tensor<f32> | |
%280 = torch_c.from_builtin_tensor %__auto.blk.11.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.11.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.11.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%281 = torch_c.from_builtin_tensor %__auto.blk.11.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.11.ffn_gate.weight3Ad = util.global.load @"__auto.blk.11.ffn_gate.weight:d" : tensor<f32> | |
%282 = torch_c.from_builtin_tensor %__auto.blk.11.ffn_gate.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.11.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.11.ffn_up.q_input:rscale" : tensor<f32> | |
%283 = torch_c.from_builtin_tensor %__auto.blk.11.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.11.ffn_up.weight3Aqs = util.global.load @"__auto.blk.11.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%284 = torch_c.from_builtin_tensor %__auto.blk.11.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.11.ffn_up.weight3Ad = util.global.load @"__auto.blk.11.ffn_up.weight:d" : tensor<f32> | |
%285 = torch_c.from_builtin_tensor %__auto.blk.11.ffn_up.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.11.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.11.ffn_down.q_input:rscale" : tensor<f32> | |
%286 = torch_c.from_builtin_tensor %__auto.blk.11.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.11.ffn_down.weight3Aqs = util.global.load @"__auto.blk.11.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ> | |
%287 = torch_c.from_builtin_tensor %__auto.blk.11.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ> | |
%__auto.blk.11.ffn_down.weight3Ad = util.global.load @"__auto.blk.11.ffn_down.weight:d" : tensor<f32> | |
%288 = torch_c.from_builtin_tensor %__auto.blk.11.ffn_down.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.12.attn_norm.weight = util.global.load @__auto.blk.12.attn_norm.weight : tensor<4096xbf16> | |
%289 = torch_c.from_builtin_tensor %__auto.blk.12.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.12.attn_q.q_input3Arscale = util.global.load @"__auto.blk.12.attn_q.q_input:rscale" : tensor<f32> | |
%290 = torch_c.from_builtin_tensor %__auto.blk.12.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.12.attn_q.weight3Aqs = util.global.load @"__auto.blk.12.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%291 = torch_c.from_builtin_tensor %__auto.blk.12.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.12.attn_q.q_output3Arscale = util.global.load @"__auto.blk.12.attn_q.q_output:rscale" : tensor<f32> | |
%292 = torch_c.from_builtin_tensor %__auto.blk.12.attn_q.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.12.attn_k.q_input3Arscale = util.global.load @"__auto.blk.12.attn_k.q_input:rscale" : tensor<f32> | |
%293 = torch_c.from_builtin_tensor %__auto.blk.12.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.12.attn_k.weight3Aqs = util.global.load @"__auto.blk.12.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%294 = torch_c.from_builtin_tensor %__auto.blk.12.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.12.attn_k.q_output3Arscale = util.global.load @"__auto.blk.12.attn_k.q_output:rscale" : tensor<f32> | |
%295 = torch_c.from_builtin_tensor %__auto.blk.12.attn_k.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.12.attn_v.q_input3Arscale = util.global.load @"__auto.blk.12.attn_v.q_input:rscale" : tensor<f32> | |
%296 = torch_c.from_builtin_tensor %__auto.blk.12.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.12.attn_v.weight3Aqs = util.global.load @"__auto.blk.12.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%297 = torch_c.from_builtin_tensor %__auto.blk.12.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.12.attn_v.q_output3Arscale = util.global.load @"__auto.blk.12.attn_v.q_output:rscale" : tensor<f32> | |
%298 = torch_c.from_builtin_tensor %__auto.blk.12.attn_v.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%299 = torch.vtensor.literal(dense<0.0883883461> : tensor<f32>) : !torch.vtensor<[],f32> | |
%__auto.blk.12.attn_output.q_input3Arscale = util.global.load @"__auto.blk.12.attn_output.q_input:rscale" : tensor<f32> | |
%300 = torch_c.from_builtin_tensor %__auto.blk.12.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.12.attn_output.weight3Aqs = util.global.load @"__auto.blk.12.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%301 = torch_c.from_builtin_tensor %__auto.blk.12.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.12.attn_output.weight3Ad = util.global.load @"__auto.blk.12.attn_output.weight:d" : tensor<f32> | |
%302 = torch_c.from_builtin_tensor %__auto.blk.12.attn_output.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.12.ffn_norm.weight = util.global.load @__auto.blk.12.ffn_norm.weight : tensor<4096xbf16> | |
%303 = torch_c.from_builtin_tensor %__auto.blk.12.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.12.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.12.ffn_gate.q_input:rscale" : tensor<f32> | |
%304 = torch_c.from_builtin_tensor %__auto.blk.12.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.12.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.12.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%305 = torch_c.from_builtin_tensor %__auto.blk.12.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.12.ffn_gate.weight3Ad = util.global.load @"__auto.blk.12.ffn_gate.weight:d" : tensor<f32> | |
%306 = torch_c.from_builtin_tensor %__auto.blk.12.ffn_gate.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.12.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.12.ffn_up.q_input:rscale" : tensor<f32> | |
%307 = torch_c.from_builtin_tensor %__auto.blk.12.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.12.ffn_up.weight3Aqs = util.global.load @"__auto.blk.12.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%308 = torch_c.from_builtin_tensor %__auto.blk.12.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.12.ffn_up.weight3Ad = util.global.load @"__auto.blk.12.ffn_up.weight:d" : tensor<f32> | |
%309 = torch_c.from_builtin_tensor %__auto.blk.12.ffn_up.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.12.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.12.ffn_down.q_input:rscale" : tensor<f32> | |
%310 = torch_c.from_builtin_tensor %__auto.blk.12.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.12.ffn_down.weight3Aqs = util.global.load @"__auto.blk.12.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ> | |
%311 = torch_c.from_builtin_tensor %__auto.blk.12.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ> | |
%__auto.blk.12.ffn_down.weight3Ad = util.global.load @"__auto.blk.12.ffn_down.weight:d" : tensor<f32> | |
%312 = torch_c.from_builtin_tensor %__auto.blk.12.ffn_down.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.13.attn_norm.weight = util.global.load @__auto.blk.13.attn_norm.weight : tensor<4096xbf16> | |
%313 = torch_c.from_builtin_tensor %__auto.blk.13.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.13.attn_q.q_input3Arscale = util.global.load @"__auto.blk.13.attn_q.q_input:rscale" : tensor<f32> | |
%314 = torch_c.from_builtin_tensor %__auto.blk.13.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.13.attn_q.weight3Aqs = util.global.load @"__auto.blk.13.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%315 = torch_c.from_builtin_tensor %__auto.blk.13.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.13.attn_q.q_output3Arscale = util.global.load @"__auto.blk.13.attn_q.q_output:rscale" : tensor<f32> | |
%316 = torch_c.from_builtin_tensor %__auto.blk.13.attn_q.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.13.attn_k.q_input3Arscale = util.global.load @"__auto.blk.13.attn_k.q_input:rscale" : tensor<f32> | |
%317 = torch_c.from_builtin_tensor %__auto.blk.13.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.13.attn_k.weight3Aqs = util.global.load @"__auto.blk.13.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%318 = torch_c.from_builtin_tensor %__auto.blk.13.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.13.attn_k.q_output3Arscale = util.global.load @"__auto.blk.13.attn_k.q_output:rscale" : tensor<f32> | |
%319 = torch_c.from_builtin_tensor %__auto.blk.13.attn_k.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.13.attn_v.q_input3Arscale = util.global.load @"__auto.blk.13.attn_v.q_input:rscale" : tensor<f32> | |
%320 = torch_c.from_builtin_tensor %__auto.blk.13.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.13.attn_v.weight3Aqs = util.global.load @"__auto.blk.13.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%321 = torch_c.from_builtin_tensor %__auto.blk.13.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.13.attn_v.q_output3Arscale = util.global.load @"__auto.blk.13.attn_v.q_output:rscale" : tensor<f32> | |
%322 = torch_c.from_builtin_tensor %__auto.blk.13.attn_v.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%323 = torch.vtensor.literal(dense<0.0883883461> : tensor<f32>) : !torch.vtensor<[],f32> | |
%__auto.blk.13.attn_output.q_input3Arscale = util.global.load @"__auto.blk.13.attn_output.q_input:rscale" : tensor<f32> | |
%324 = torch_c.from_builtin_tensor %__auto.blk.13.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.13.attn_output.weight3Aqs = util.global.load @"__auto.blk.13.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%325 = torch_c.from_builtin_tensor %__auto.blk.13.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.13.attn_output.weight3Ad = util.global.load @"__auto.blk.13.attn_output.weight:d" : tensor<f32> | |
%326 = torch_c.from_builtin_tensor %__auto.blk.13.attn_output.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.13.ffn_norm.weight = util.global.load @__auto.blk.13.ffn_norm.weight : tensor<4096xbf16> | |
%327 = torch_c.from_builtin_tensor %__auto.blk.13.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.13.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.13.ffn_gate.q_input:rscale" : tensor<f32> | |
%328 = torch_c.from_builtin_tensor %__auto.blk.13.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.13.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.13.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%329 = torch_c.from_builtin_tensor %__auto.blk.13.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.13.ffn_gate.weight3Ad = util.global.load @"__auto.blk.13.ffn_gate.weight:d" : tensor<f32> | |
%330 = torch_c.from_builtin_tensor %__auto.blk.13.ffn_gate.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.13.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.13.ffn_up.q_input:rscale" : tensor<f32> | |
%331 = torch_c.from_builtin_tensor %__auto.blk.13.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.13.ffn_up.weight3Aqs = util.global.load @"__auto.blk.13.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%332 = torch_c.from_builtin_tensor %__auto.blk.13.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.13.ffn_up.weight3Ad = util.global.load @"__auto.blk.13.ffn_up.weight:d" : tensor<f32> | |
%333 = torch_c.from_builtin_tensor %__auto.blk.13.ffn_up.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.13.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.13.ffn_down.q_input:rscale" : tensor<f32> | |
%334 = torch_c.from_builtin_tensor %__auto.blk.13.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.13.ffn_down.weight3Aqs = util.global.load @"__auto.blk.13.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ> | |
%335 = torch_c.from_builtin_tensor %__auto.blk.13.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ> | |
%__auto.blk.13.ffn_down.weight3Ad = util.global.load @"__auto.blk.13.ffn_down.weight:d" : tensor<f32> | |
%336 = torch_c.from_builtin_tensor %__auto.blk.13.ffn_down.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.14.attn_norm.weight = util.global.load @__auto.blk.14.attn_norm.weight : tensor<4096xbf16> | |
%337 = torch_c.from_builtin_tensor %__auto.blk.14.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.14.attn_q.q_input3Arscale = util.global.load @"__auto.blk.14.attn_q.q_input:rscale" : tensor<f32> | |
%338 = torch_c.from_builtin_tensor %__auto.blk.14.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.14.attn_q.weight3Aqs = util.global.load @"__auto.blk.14.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%339 = torch_c.from_builtin_tensor %__auto.blk.14.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.14.attn_q.q_output3Arscale = util.global.load @"__auto.blk.14.attn_q.q_output:rscale" : tensor<f32> | |
%340 = torch_c.from_builtin_tensor %__auto.blk.14.attn_q.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.14.attn_k.q_input3Arscale = util.global.load @"__auto.blk.14.attn_k.q_input:rscale" : tensor<f32> | |
%341 = torch_c.from_builtin_tensor %__auto.blk.14.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.14.attn_k.weight3Aqs = util.global.load @"__auto.blk.14.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%342 = torch_c.from_builtin_tensor %__auto.blk.14.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.14.attn_k.q_output3Arscale = util.global.load @"__auto.blk.14.attn_k.q_output:rscale" : tensor<f32> | |
%343 = torch_c.from_builtin_tensor %__auto.blk.14.attn_k.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.14.attn_v.q_input3Arscale = util.global.load @"__auto.blk.14.attn_v.q_input:rscale" : tensor<f32> | |
%344 = torch_c.from_builtin_tensor %__auto.blk.14.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.14.attn_v.weight3Aqs = util.global.load @"__auto.blk.14.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%345 = torch_c.from_builtin_tensor %__auto.blk.14.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.14.attn_v.q_output3Arscale = util.global.load @"__auto.blk.14.attn_v.q_output:rscale" : tensor<f32> | |
%346 = torch_c.from_builtin_tensor %__auto.blk.14.attn_v.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%347 = torch.vtensor.literal(dense<0.0883883461> : tensor<f32>) : !torch.vtensor<[],f32> | |
%__auto.blk.14.attn_output.q_input3Arscale = util.global.load @"__auto.blk.14.attn_output.q_input:rscale" : tensor<f32> | |
%348 = torch_c.from_builtin_tensor %__auto.blk.14.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.14.attn_output.weight3Aqs = util.global.load @"__auto.blk.14.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%349 = torch_c.from_builtin_tensor %__auto.blk.14.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.14.attn_output.weight3Ad = util.global.load @"__auto.blk.14.attn_output.weight:d" : tensor<f32> | |
%350 = torch_c.from_builtin_tensor %__auto.blk.14.attn_output.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.14.ffn_norm.weight = util.global.load @__auto.blk.14.ffn_norm.weight : tensor<4096xbf16> | |
%351 = torch_c.from_builtin_tensor %__auto.blk.14.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.14.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.14.ffn_gate.q_input:rscale" : tensor<f32> | |
%352 = torch_c.from_builtin_tensor %__auto.blk.14.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.14.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.14.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%353 = torch_c.from_builtin_tensor %__auto.blk.14.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.14.ffn_gate.weight3Ad = util.global.load @"__auto.blk.14.ffn_gate.weight:d" : tensor<f32> | |
%354 = torch_c.from_builtin_tensor %__auto.blk.14.ffn_gate.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.14.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.14.ffn_up.q_input:rscale" : tensor<f32> | |
%355 = torch_c.from_builtin_tensor %__auto.blk.14.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.14.ffn_up.weight3Aqs = util.global.load @"__auto.blk.14.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%356 = torch_c.from_builtin_tensor %__auto.blk.14.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.14.ffn_up.weight3Ad = util.global.load @"__auto.blk.14.ffn_up.weight:d" : tensor<f32> | |
%357 = torch_c.from_builtin_tensor %__auto.blk.14.ffn_up.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.14.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.14.ffn_down.q_input:rscale" : tensor<f32> | |
%358 = torch_c.from_builtin_tensor %__auto.blk.14.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.14.ffn_down.weight3Aqs = util.global.load @"__auto.blk.14.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ> | |
%359 = torch_c.from_builtin_tensor %__auto.blk.14.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ> | |
%__auto.blk.14.ffn_down.weight3Ad = util.global.load @"__auto.blk.14.ffn_down.weight:d" : tensor<f32> | |
%360 = torch_c.from_builtin_tensor %__auto.blk.14.ffn_down.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.15.attn_norm.weight = util.global.load @__auto.blk.15.attn_norm.weight : tensor<4096xbf16> | |
%361 = torch_c.from_builtin_tensor %__auto.blk.15.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.15.attn_q.q_input3Arscale = util.global.load @"__auto.blk.15.attn_q.q_input:rscale" : tensor<f32> | |
%362 = torch_c.from_builtin_tensor %__auto.blk.15.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.15.attn_q.weight3Aqs = util.global.load @"__auto.blk.15.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%363 = torch_c.from_builtin_tensor %__auto.blk.15.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.15.attn_q.q_output3Arscale = util.global.load @"__auto.blk.15.attn_q.q_output:rscale" : tensor<f32> | |
%364 = torch_c.from_builtin_tensor %__auto.blk.15.attn_q.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.15.attn_k.q_input3Arscale = util.global.load @"__auto.blk.15.attn_k.q_input:rscale" : tensor<f32> | |
%365 = torch_c.from_builtin_tensor %__auto.blk.15.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.15.attn_k.weight3Aqs = util.global.load @"__auto.blk.15.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%366 = torch_c.from_builtin_tensor %__auto.blk.15.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.15.attn_k.q_output3Arscale = util.global.load @"__auto.blk.15.attn_k.q_output:rscale" : tensor<f32> | |
%367 = torch_c.from_builtin_tensor %__auto.blk.15.attn_k.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.15.attn_v.q_input3Arscale = util.global.load @"__auto.blk.15.attn_v.q_input:rscale" : tensor<f32> | |
%368 = torch_c.from_builtin_tensor %__auto.blk.15.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.15.attn_v.weight3Aqs = util.global.load @"__auto.blk.15.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%369 = torch_c.from_builtin_tensor %__auto.blk.15.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.15.attn_v.q_output3Arscale = util.global.load @"__auto.blk.15.attn_v.q_output:rscale" : tensor<f32> | |
%370 = torch_c.from_builtin_tensor %__auto.blk.15.attn_v.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%371 = torch.vtensor.literal(dense<0.0883883461> : tensor<f32>) : !torch.vtensor<[],f32> | |
%__auto.blk.15.attn_output.q_input3Arscale = util.global.load @"__auto.blk.15.attn_output.q_input:rscale" : tensor<f32> | |
%372 = torch_c.from_builtin_tensor %__auto.blk.15.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.15.attn_output.weight3Aqs = util.global.load @"__auto.blk.15.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%373 = torch_c.from_builtin_tensor %__auto.blk.15.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.15.attn_output.weight3Ad = util.global.load @"__auto.blk.15.attn_output.weight:d" : tensor<f32> | |
%374 = torch_c.from_builtin_tensor %__auto.blk.15.attn_output.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.15.ffn_norm.weight = util.global.load @__auto.blk.15.ffn_norm.weight : tensor<4096xbf16> | |
%375 = torch_c.from_builtin_tensor %__auto.blk.15.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.15.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.15.ffn_gate.q_input:rscale" : tensor<f32> | |
%376 = torch_c.from_builtin_tensor %__auto.blk.15.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.15.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.15.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%377 = torch_c.from_builtin_tensor %__auto.blk.15.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.15.ffn_gate.weight3Ad = util.global.load @"__auto.blk.15.ffn_gate.weight:d" : tensor<f32> | |
%378 = torch_c.from_builtin_tensor %__auto.blk.15.ffn_gate.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.15.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.15.ffn_up.q_input:rscale" : tensor<f32> | |
%379 = torch_c.from_builtin_tensor %__auto.blk.15.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.15.ffn_up.weight3Aqs = util.global.load @"__auto.blk.15.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%380 = torch_c.from_builtin_tensor %__auto.blk.15.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.15.ffn_up.weight3Ad = util.global.load @"__auto.blk.15.ffn_up.weight:d" : tensor<f32> | |
%381 = torch_c.from_builtin_tensor %__auto.blk.15.ffn_up.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.15.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.15.ffn_down.q_input:rscale" : tensor<f32> | |
%382 = torch_c.from_builtin_tensor %__auto.blk.15.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.15.ffn_down.weight3Aqs = util.global.load @"__auto.blk.15.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ> | |
%383 = torch_c.from_builtin_tensor %__auto.blk.15.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ> | |
%__auto.blk.15.ffn_down.weight3Ad = util.global.load @"__auto.blk.15.ffn_down.weight:d" : tensor<f32> | |
%384 = torch_c.from_builtin_tensor %__auto.blk.15.ffn_down.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.16.attn_norm.weight = util.global.load @__auto.blk.16.attn_norm.weight : tensor<4096xbf16> | |
%385 = torch_c.from_builtin_tensor %__auto.blk.16.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.16.attn_q.q_input3Arscale = util.global.load @"__auto.blk.16.attn_q.q_input:rscale" : tensor<f32> | |
%386 = torch_c.from_builtin_tensor %__auto.blk.16.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.16.attn_q.weight3Aqs = util.global.load @"__auto.blk.16.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%387 = torch_c.from_builtin_tensor %__auto.blk.16.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.16.attn_q.q_output3Arscale = util.global.load @"__auto.blk.16.attn_q.q_output:rscale" : tensor<f32> | |
%388 = torch_c.from_builtin_tensor %__auto.blk.16.attn_q.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.16.attn_k.q_input3Arscale = util.global.load @"__auto.blk.16.attn_k.q_input:rscale" : tensor<f32> | |
%389 = torch_c.from_builtin_tensor %__auto.blk.16.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.16.attn_k.weight3Aqs = util.global.load @"__auto.blk.16.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%390 = torch_c.from_builtin_tensor %__auto.blk.16.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.16.attn_k.q_output3Arscale = util.global.load @"__auto.blk.16.attn_k.q_output:rscale" : tensor<f32> | |
%391 = torch_c.from_builtin_tensor %__auto.blk.16.attn_k.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.16.attn_v.q_input3Arscale = util.global.load @"__auto.blk.16.attn_v.q_input:rscale" : tensor<f32> | |
%392 = torch_c.from_builtin_tensor %__auto.blk.16.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.16.attn_v.weight3Aqs = util.global.load @"__auto.blk.16.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%393 = torch_c.from_builtin_tensor %__auto.blk.16.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.16.attn_v.q_output3Arscale = util.global.load @"__auto.blk.16.attn_v.q_output:rscale" : tensor<f32> | |
%394 = torch_c.from_builtin_tensor %__auto.blk.16.attn_v.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%395 = torch.vtensor.literal(dense<0.0883883461> : tensor<f32>) : !torch.vtensor<[],f32> | |
%__auto.blk.16.attn_output.q_input3Arscale = util.global.load @"__auto.blk.16.attn_output.q_input:rscale" : tensor<f32> | |
%396 = torch_c.from_builtin_tensor %__auto.blk.16.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.16.attn_output.weight3Aqs = util.global.load @"__auto.blk.16.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%397 = torch_c.from_builtin_tensor %__auto.blk.16.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.16.attn_output.weight3Ad = util.global.load @"__auto.blk.16.attn_output.weight:d" : tensor<f32> | |
%398 = torch_c.from_builtin_tensor %__auto.blk.16.attn_output.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.16.ffn_norm.weight = util.global.load @__auto.blk.16.ffn_norm.weight : tensor<4096xbf16> | |
%399 = torch_c.from_builtin_tensor %__auto.blk.16.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.16.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.16.ffn_gate.q_input:rscale" : tensor<f32> | |
%400 = torch_c.from_builtin_tensor %__auto.blk.16.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.16.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.16.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%401 = torch_c.from_builtin_tensor %__auto.blk.16.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.16.ffn_gate.weight3Ad = util.global.load @"__auto.blk.16.ffn_gate.weight:d" : tensor<f32> | |
%402 = torch_c.from_builtin_tensor %__auto.blk.16.ffn_gate.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.16.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.16.ffn_up.q_input:rscale" : tensor<f32> | |
%403 = torch_c.from_builtin_tensor %__auto.blk.16.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.16.ffn_up.weight3Aqs = util.global.load @"__auto.blk.16.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%404 = torch_c.from_builtin_tensor %__auto.blk.16.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.16.ffn_up.weight3Ad = util.global.load @"__auto.blk.16.ffn_up.weight:d" : tensor<f32> | |
%405 = torch_c.from_builtin_tensor %__auto.blk.16.ffn_up.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.16.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.16.ffn_down.q_input:rscale" : tensor<f32> | |
%406 = torch_c.from_builtin_tensor %__auto.blk.16.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.16.ffn_down.weight3Aqs = util.global.load @"__auto.blk.16.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ> | |
%407 = torch_c.from_builtin_tensor %__auto.blk.16.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ> | |
%__auto.blk.16.ffn_down.weight3Ad = util.global.load @"__auto.blk.16.ffn_down.weight:d" : tensor<f32> | |
%408 = torch_c.from_builtin_tensor %__auto.blk.16.ffn_down.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.17.attn_norm.weight = util.global.load @__auto.blk.17.attn_norm.weight : tensor<4096xbf16> | |
%409 = torch_c.from_builtin_tensor %__auto.blk.17.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.17.attn_q.q_input3Arscale = util.global.load @"__auto.blk.17.attn_q.q_input:rscale" : tensor<f32> | |
%410 = torch_c.from_builtin_tensor %__auto.blk.17.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.17.attn_q.weight3Aqs = util.global.load @"__auto.blk.17.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%411 = torch_c.from_builtin_tensor %__auto.blk.17.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.17.attn_q.q_output3Arscale = util.global.load @"__auto.blk.17.attn_q.q_output:rscale" : tensor<f32> | |
%412 = torch_c.from_builtin_tensor %__auto.blk.17.attn_q.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.17.attn_k.q_input3Arscale = util.global.load @"__auto.blk.17.attn_k.q_input:rscale" : tensor<f32> | |
%413 = torch_c.from_builtin_tensor %__auto.blk.17.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.17.attn_k.weight3Aqs = util.global.load @"__auto.blk.17.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%414 = torch_c.from_builtin_tensor %__auto.blk.17.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.17.attn_k.q_output3Arscale = util.global.load @"__auto.blk.17.attn_k.q_output:rscale" : tensor<f32> | |
%415 = torch_c.from_builtin_tensor %__auto.blk.17.attn_k.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.17.attn_v.q_input3Arscale = util.global.load @"__auto.blk.17.attn_v.q_input:rscale" : tensor<f32> | |
%416 = torch_c.from_builtin_tensor %__auto.blk.17.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.17.attn_v.weight3Aqs = util.global.load @"__auto.blk.17.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%417 = torch_c.from_builtin_tensor %__auto.blk.17.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.17.attn_v.q_output3Arscale = util.global.load @"__auto.blk.17.attn_v.q_output:rscale" : tensor<f32> | |
%418 = torch_c.from_builtin_tensor %__auto.blk.17.attn_v.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%419 = torch.vtensor.literal(dense<0.0883883461> : tensor<f32>) : !torch.vtensor<[],f32> | |
%__auto.blk.17.attn_output.q_input3Arscale = util.global.load @"__auto.blk.17.attn_output.q_input:rscale" : tensor<f32> | |
%420 = torch_c.from_builtin_tensor %__auto.blk.17.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.17.attn_output.weight3Aqs = util.global.load @"__auto.blk.17.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%421 = torch_c.from_builtin_tensor %__auto.blk.17.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.17.attn_output.weight3Ad = util.global.load @"__auto.blk.17.attn_output.weight:d" : tensor<f32> | |
%422 = torch_c.from_builtin_tensor %__auto.blk.17.attn_output.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.17.ffn_norm.weight = util.global.load @__auto.blk.17.ffn_norm.weight : tensor<4096xbf16> | |
%423 = torch_c.from_builtin_tensor %__auto.blk.17.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.17.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.17.ffn_gate.q_input:rscale" : tensor<f32> | |
%424 = torch_c.from_builtin_tensor %__auto.blk.17.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.17.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.17.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%425 = torch_c.from_builtin_tensor %__auto.blk.17.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.17.ffn_gate.weight3Ad = util.global.load @"__auto.blk.17.ffn_gate.weight:d" : tensor<f32> | |
%426 = torch_c.from_builtin_tensor %__auto.blk.17.ffn_gate.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.17.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.17.ffn_up.q_input:rscale" : tensor<f32> | |
%427 = torch_c.from_builtin_tensor %__auto.blk.17.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.17.ffn_up.weight3Aqs = util.global.load @"__auto.blk.17.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%428 = torch_c.from_builtin_tensor %__auto.blk.17.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.17.ffn_up.weight3Ad = util.global.load @"__auto.blk.17.ffn_up.weight:d" : tensor<f32> | |
%429 = torch_c.from_builtin_tensor %__auto.blk.17.ffn_up.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.17.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.17.ffn_down.q_input:rscale" : tensor<f32> | |
%430 = torch_c.from_builtin_tensor %__auto.blk.17.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.17.ffn_down.weight3Aqs = util.global.load @"__auto.blk.17.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ> | |
%431 = torch_c.from_builtin_tensor %__auto.blk.17.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ> | |
%__auto.blk.17.ffn_down.weight3Ad = util.global.load @"__auto.blk.17.ffn_down.weight:d" : tensor<f32> | |
%432 = torch_c.from_builtin_tensor %__auto.blk.17.ffn_down.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.18.attn_norm.weight = util.global.load @__auto.blk.18.attn_norm.weight : tensor<4096xbf16> | |
%433 = torch_c.from_builtin_tensor %__auto.blk.18.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.18.attn_q.q_input3Arscale = util.global.load @"__auto.blk.18.attn_q.q_input:rscale" : tensor<f32> | |
%434 = torch_c.from_builtin_tensor %__auto.blk.18.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.18.attn_q.weight3Aqs = util.global.load @"__auto.blk.18.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%435 = torch_c.from_builtin_tensor %__auto.blk.18.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.18.attn_q.q_output3Arscale = util.global.load @"__auto.blk.18.attn_q.q_output:rscale" : tensor<f32> | |
%436 = torch_c.from_builtin_tensor %__auto.blk.18.attn_q.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.18.attn_k.q_input3Arscale = util.global.load @"__auto.blk.18.attn_k.q_input:rscale" : tensor<f32> | |
%437 = torch_c.from_builtin_tensor %__auto.blk.18.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.18.attn_k.weight3Aqs = util.global.load @"__auto.blk.18.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%438 = torch_c.from_builtin_tensor %__auto.blk.18.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.18.attn_k.q_output3Arscale = util.global.load @"__auto.blk.18.attn_k.q_output:rscale" : tensor<f32> | |
%439 = torch_c.from_builtin_tensor %__auto.blk.18.attn_k.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.18.attn_v.q_input3Arscale = util.global.load @"__auto.blk.18.attn_v.q_input:rscale" : tensor<f32> | |
%440 = torch_c.from_builtin_tensor %__auto.blk.18.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.18.attn_v.weight3Aqs = util.global.load @"__auto.blk.18.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%441 = torch_c.from_builtin_tensor %__auto.blk.18.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.18.attn_v.q_output3Arscale = util.global.load @"__auto.blk.18.attn_v.q_output:rscale" : tensor<f32> | |
%442 = torch_c.from_builtin_tensor %__auto.blk.18.attn_v.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%443 = torch.vtensor.literal(dense<0.0883883461> : tensor<f32>) : !torch.vtensor<[],f32> | |
%__auto.blk.18.attn_output.q_input3Arscale = util.global.load @"__auto.blk.18.attn_output.q_input:rscale" : tensor<f32> | |
%444 = torch_c.from_builtin_tensor %__auto.blk.18.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.18.attn_output.weight3Aqs = util.global.load @"__auto.blk.18.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%445 = torch_c.from_builtin_tensor %__auto.blk.18.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.18.attn_output.weight3Ad = util.global.load @"__auto.blk.18.attn_output.weight:d" : tensor<f32> | |
%446 = torch_c.from_builtin_tensor %__auto.blk.18.attn_output.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.18.ffn_norm.weight = util.global.load @__auto.blk.18.ffn_norm.weight : tensor<4096xbf16> | |
%447 = torch_c.from_builtin_tensor %__auto.blk.18.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.18.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.18.ffn_gate.q_input:rscale" : tensor<f32> | |
%448 = torch_c.from_builtin_tensor %__auto.blk.18.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.18.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.18.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%449 = torch_c.from_builtin_tensor %__auto.blk.18.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.18.ffn_gate.weight3Ad = util.global.load @"__auto.blk.18.ffn_gate.weight:d" : tensor<f32> | |
%450 = torch_c.from_builtin_tensor %__auto.blk.18.ffn_gate.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.18.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.18.ffn_up.q_input:rscale" : tensor<f32> | |
%451 = torch_c.from_builtin_tensor %__auto.blk.18.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.18.ffn_up.weight3Aqs = util.global.load @"__auto.blk.18.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%452 = torch_c.from_builtin_tensor %__auto.blk.18.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.18.ffn_up.weight3Ad = util.global.load @"__auto.blk.18.ffn_up.weight:d" : tensor<f32> | |
%453 = torch_c.from_builtin_tensor %__auto.blk.18.ffn_up.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.18.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.18.ffn_down.q_input:rscale" : tensor<f32> | |
%454 = torch_c.from_builtin_tensor %__auto.blk.18.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.18.ffn_down.weight3Aqs = util.global.load @"__auto.blk.18.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ> | |
%455 = torch_c.from_builtin_tensor %__auto.blk.18.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ> | |
%__auto.blk.18.ffn_down.weight3Ad = util.global.load @"__auto.blk.18.ffn_down.weight:d" : tensor<f32> | |
%456 = torch_c.from_builtin_tensor %__auto.blk.18.ffn_down.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.19.attn_norm.weight = util.global.load @__auto.blk.19.attn_norm.weight : tensor<4096xbf16> | |
%457 = torch_c.from_builtin_tensor %__auto.blk.19.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.19.attn_q.q_input3Arscale = util.global.load @"__auto.blk.19.attn_q.q_input:rscale" : tensor<f32> | |
%458 = torch_c.from_builtin_tensor %__auto.blk.19.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.19.attn_q.weight3Aqs = util.global.load @"__auto.blk.19.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%459 = torch_c.from_builtin_tensor %__auto.blk.19.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.19.attn_q.q_output3Arscale = util.global.load @"__auto.blk.19.attn_q.q_output:rscale" : tensor<f32> | |
%460 = torch_c.from_builtin_tensor %__auto.blk.19.attn_q.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.19.attn_k.q_input3Arscale = util.global.load @"__auto.blk.19.attn_k.q_input:rscale" : tensor<f32> | |
%461 = torch_c.from_builtin_tensor %__auto.blk.19.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.19.attn_k.weight3Aqs = util.global.load @"__auto.blk.19.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%462 = torch_c.from_builtin_tensor %__auto.blk.19.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.19.attn_k.q_output3Arscale = util.global.load @"__auto.blk.19.attn_k.q_output:rscale" : tensor<f32> | |
%463 = torch_c.from_builtin_tensor %__auto.blk.19.attn_k.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.19.attn_v.q_input3Arscale = util.global.load @"__auto.blk.19.attn_v.q_input:rscale" : tensor<f32> | |
%464 = torch_c.from_builtin_tensor %__auto.blk.19.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.19.attn_v.weight3Aqs = util.global.load @"__auto.blk.19.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%465 = torch_c.from_builtin_tensor %__auto.blk.19.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.19.attn_v.q_output3Arscale = util.global.load @"__auto.blk.19.attn_v.q_output:rscale" : tensor<f32> | |
%466 = torch_c.from_builtin_tensor %__auto.blk.19.attn_v.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%467 = torch.vtensor.literal(dense<0.0883883461> : tensor<f32>) : !torch.vtensor<[],f32> | |
%__auto.blk.19.attn_output.q_input3Arscale = util.global.load @"__auto.blk.19.attn_output.q_input:rscale" : tensor<f32> | |
%468 = torch_c.from_builtin_tensor %__auto.blk.19.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.19.attn_output.weight3Aqs = util.global.load @"__auto.blk.19.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%469 = torch_c.from_builtin_tensor %__auto.blk.19.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.19.attn_output.weight3Ad = util.global.load @"__auto.blk.19.attn_output.weight:d" : tensor<f32> | |
%470 = torch_c.from_builtin_tensor %__auto.blk.19.attn_output.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.19.ffn_norm.weight = util.global.load @__auto.blk.19.ffn_norm.weight : tensor<4096xbf16> | |
%471 = torch_c.from_builtin_tensor %__auto.blk.19.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.19.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.19.ffn_gate.q_input:rscale" : tensor<f32> | |
%472 = torch_c.from_builtin_tensor %__auto.blk.19.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.19.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.19.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%473 = torch_c.from_builtin_tensor %__auto.blk.19.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.19.ffn_gate.weight3Ad = util.global.load @"__auto.blk.19.ffn_gate.weight:d" : tensor<f32> | |
%474 = torch_c.from_builtin_tensor %__auto.blk.19.ffn_gate.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.19.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.19.ffn_up.q_input:rscale" : tensor<f32> | |
%475 = torch_c.from_builtin_tensor %__auto.blk.19.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.19.ffn_up.weight3Aqs = util.global.load @"__auto.blk.19.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%476 = torch_c.from_builtin_tensor %__auto.blk.19.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.19.ffn_up.weight3Ad = util.global.load @"__auto.blk.19.ffn_up.weight:d" : tensor<f32> | |
%477 = torch_c.from_builtin_tensor %__auto.blk.19.ffn_up.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.19.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.19.ffn_down.q_input:rscale" : tensor<f32> | |
%478 = torch_c.from_builtin_tensor %__auto.blk.19.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.19.ffn_down.weight3Aqs = util.global.load @"__auto.blk.19.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ> | |
%479 = torch_c.from_builtin_tensor %__auto.blk.19.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ> | |
%__auto.blk.19.ffn_down.weight3Ad = util.global.load @"__auto.blk.19.ffn_down.weight:d" : tensor<f32> | |
%480 = torch_c.from_builtin_tensor %__auto.blk.19.ffn_down.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.20.attn_norm.weight = util.global.load @__auto.blk.20.attn_norm.weight : tensor<4096xbf16> | |
%481 = torch_c.from_builtin_tensor %__auto.blk.20.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.20.attn_q.q_input3Arscale = util.global.load @"__auto.blk.20.attn_q.q_input:rscale" : tensor<f32> | |
%482 = torch_c.from_builtin_tensor %__auto.blk.20.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.20.attn_q.weight3Aqs = util.global.load @"__auto.blk.20.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%483 = torch_c.from_builtin_tensor %__auto.blk.20.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.20.attn_q.q_output3Arscale = util.global.load @"__auto.blk.20.attn_q.q_output:rscale" : tensor<f32> | |
%484 = torch_c.from_builtin_tensor %__auto.blk.20.attn_q.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.20.attn_k.q_input3Arscale = util.global.load @"__auto.blk.20.attn_k.q_input:rscale" : tensor<f32> | |
%485 = torch_c.from_builtin_tensor %__auto.blk.20.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.20.attn_k.weight3Aqs = util.global.load @"__auto.blk.20.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%486 = torch_c.from_builtin_tensor %__auto.blk.20.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.20.attn_k.q_output3Arscale = util.global.load @"__auto.blk.20.attn_k.q_output:rscale" : tensor<f32> | |
%487 = torch_c.from_builtin_tensor %__auto.blk.20.attn_k.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.20.attn_v.q_input3Arscale = util.global.load @"__auto.blk.20.attn_v.q_input:rscale" : tensor<f32> | |
%488 = torch_c.from_builtin_tensor %__auto.blk.20.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.20.attn_v.weight3Aqs = util.global.load @"__auto.blk.20.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%489 = torch_c.from_builtin_tensor %__auto.blk.20.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.20.attn_v.q_output3Arscale = util.global.load @"__auto.blk.20.attn_v.q_output:rscale" : tensor<f32> | |
%490 = torch_c.from_builtin_tensor %__auto.blk.20.attn_v.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%491 = torch.vtensor.literal(dense<0.0883883461> : tensor<f32>) : !torch.vtensor<[],f32> | |
%__auto.blk.20.attn_output.q_input3Arscale = util.global.load @"__auto.blk.20.attn_output.q_input:rscale" : tensor<f32> | |
%492 = torch_c.from_builtin_tensor %__auto.blk.20.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.20.attn_output.weight3Aqs = util.global.load @"__auto.blk.20.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%493 = torch_c.from_builtin_tensor %__auto.blk.20.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.20.attn_output.weight3Ad = util.global.load @"__auto.blk.20.attn_output.weight:d" : tensor<f32> | |
%494 = torch_c.from_builtin_tensor %__auto.blk.20.attn_output.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.20.ffn_norm.weight = util.global.load @__auto.blk.20.ffn_norm.weight : tensor<4096xbf16> | |
%495 = torch_c.from_builtin_tensor %__auto.blk.20.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.20.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.20.ffn_gate.q_input:rscale" : tensor<f32> | |
%496 = torch_c.from_builtin_tensor %__auto.blk.20.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.20.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.20.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%497 = torch_c.from_builtin_tensor %__auto.blk.20.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.20.ffn_gate.weight3Ad = util.global.load @"__auto.blk.20.ffn_gate.weight:d" : tensor<f32> | |
%498 = torch_c.from_builtin_tensor %__auto.blk.20.ffn_gate.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.20.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.20.ffn_up.q_input:rscale" : tensor<f32> | |
%499 = torch_c.from_builtin_tensor %__auto.blk.20.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.20.ffn_up.weight3Aqs = util.global.load @"__auto.blk.20.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%500 = torch_c.from_builtin_tensor %__auto.blk.20.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.20.ffn_up.weight3Ad = util.global.load @"__auto.blk.20.ffn_up.weight:d" : tensor<f32> | |
%501 = torch_c.from_builtin_tensor %__auto.blk.20.ffn_up.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.20.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.20.ffn_down.q_input:rscale" : tensor<f32> | |
%502 = torch_c.from_builtin_tensor %__auto.blk.20.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.20.ffn_down.weight3Aqs = util.global.load @"__auto.blk.20.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ> | |
%503 = torch_c.from_builtin_tensor %__auto.blk.20.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ> | |
%__auto.blk.20.ffn_down.weight3Ad = util.global.load @"__auto.blk.20.ffn_down.weight:d" : tensor<f32> | |
%504 = torch_c.from_builtin_tensor %__auto.blk.20.ffn_down.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.21.attn_norm.weight = util.global.load @__auto.blk.21.attn_norm.weight : tensor<4096xbf16> | |
%505 = torch_c.from_builtin_tensor %__auto.blk.21.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.21.attn_q.q_input3Arscale = util.global.load @"__auto.blk.21.attn_q.q_input:rscale" : tensor<f32> | |
%506 = torch_c.from_builtin_tensor %__auto.blk.21.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.21.attn_q.weight3Aqs = util.global.load @"__auto.blk.21.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%507 = torch_c.from_builtin_tensor %__auto.blk.21.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.21.attn_q.q_output3Arscale = util.global.load @"__auto.blk.21.attn_q.q_output:rscale" : tensor<f32> | |
%508 = torch_c.from_builtin_tensor %__auto.blk.21.attn_q.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.21.attn_k.q_input3Arscale = util.global.load @"__auto.blk.21.attn_k.q_input:rscale" : tensor<f32> | |
%509 = torch_c.from_builtin_tensor %__auto.blk.21.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.21.attn_k.weight3Aqs = util.global.load @"__auto.blk.21.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%510 = torch_c.from_builtin_tensor %__auto.blk.21.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.21.attn_k.q_output3Arscale = util.global.load @"__auto.blk.21.attn_k.q_output:rscale" : tensor<f32> | |
%511 = torch_c.from_builtin_tensor %__auto.blk.21.attn_k.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.21.attn_v.q_input3Arscale = util.global.load @"__auto.blk.21.attn_v.q_input:rscale" : tensor<f32> | |
%512 = torch_c.from_builtin_tensor %__auto.blk.21.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.21.attn_v.weight3Aqs = util.global.load @"__auto.blk.21.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%513 = torch_c.from_builtin_tensor %__auto.blk.21.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.21.attn_v.q_output3Arscale = util.global.load @"__auto.blk.21.attn_v.q_output:rscale" : tensor<f32> | |
%514 = torch_c.from_builtin_tensor %__auto.blk.21.attn_v.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%515 = torch.vtensor.literal(dense<0.0883883461> : tensor<f32>) : !torch.vtensor<[],f32> | |
%__auto.blk.21.attn_output.q_input3Arscale = util.global.load @"__auto.blk.21.attn_output.q_input:rscale" : tensor<f32> | |
%516 = torch_c.from_builtin_tensor %__auto.blk.21.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.21.attn_output.weight3Aqs = util.global.load @"__auto.blk.21.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%517 = torch_c.from_builtin_tensor %__auto.blk.21.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.21.attn_output.weight3Ad = util.global.load @"__auto.blk.21.attn_output.weight:d" : tensor<f32> | |
%518 = torch_c.from_builtin_tensor %__auto.blk.21.attn_output.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.21.ffn_norm.weight = util.global.load @__auto.blk.21.ffn_norm.weight : tensor<4096xbf16> | |
%519 = torch_c.from_builtin_tensor %__auto.blk.21.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.21.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.21.ffn_gate.q_input:rscale" : tensor<f32> | |
%520 = torch_c.from_builtin_tensor %__auto.blk.21.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.21.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.21.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%521 = torch_c.from_builtin_tensor %__auto.blk.21.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.21.ffn_gate.weight3Ad = util.global.load @"__auto.blk.21.ffn_gate.weight:d" : tensor<f32> | |
%522 = torch_c.from_builtin_tensor %__auto.blk.21.ffn_gate.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.21.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.21.ffn_up.q_input:rscale" : tensor<f32> | |
%523 = torch_c.from_builtin_tensor %__auto.blk.21.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.21.ffn_up.weight3Aqs = util.global.load @"__auto.blk.21.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%524 = torch_c.from_builtin_tensor %__auto.blk.21.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.21.ffn_up.weight3Ad = util.global.load @"__auto.blk.21.ffn_up.weight:d" : tensor<f32> | |
%525 = torch_c.from_builtin_tensor %__auto.blk.21.ffn_up.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.21.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.21.ffn_down.q_input:rscale" : tensor<f32> | |
%526 = torch_c.from_builtin_tensor %__auto.blk.21.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.21.ffn_down.weight3Aqs = util.global.load @"__auto.blk.21.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ> | |
%527 = torch_c.from_builtin_tensor %__auto.blk.21.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ> | |
%__auto.blk.21.ffn_down.weight3Ad = util.global.load @"__auto.blk.21.ffn_down.weight:d" : tensor<f32> | |
%528 = torch_c.from_builtin_tensor %__auto.blk.21.ffn_down.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.22.attn_norm.weight = util.global.load @__auto.blk.22.attn_norm.weight : tensor<4096xbf16> | |
%529 = torch_c.from_builtin_tensor %__auto.blk.22.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.22.attn_q.q_input3Arscale = util.global.load @"__auto.blk.22.attn_q.q_input:rscale" : tensor<f32> | |
%530 = torch_c.from_builtin_tensor %__auto.blk.22.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.22.attn_q.weight3Aqs = util.global.load @"__auto.blk.22.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%531 = torch_c.from_builtin_tensor %__auto.blk.22.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.22.attn_q.q_output3Arscale = util.global.load @"__auto.blk.22.attn_q.q_output:rscale" : tensor<f32> | |
%532 = torch_c.from_builtin_tensor %__auto.blk.22.attn_q.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.22.attn_k.q_input3Arscale = util.global.load @"__auto.blk.22.attn_k.q_input:rscale" : tensor<f32> | |
%533 = torch_c.from_builtin_tensor %__auto.blk.22.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.22.attn_k.weight3Aqs = util.global.load @"__auto.blk.22.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%534 = torch_c.from_builtin_tensor %__auto.blk.22.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.22.attn_k.q_output3Arscale = util.global.load @"__auto.blk.22.attn_k.q_output:rscale" : tensor<f32> | |
%535 = torch_c.from_builtin_tensor %__auto.blk.22.attn_k.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.22.attn_v.q_input3Arscale = util.global.load @"__auto.blk.22.attn_v.q_input:rscale" : tensor<f32> | |
%536 = torch_c.from_builtin_tensor %__auto.blk.22.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.22.attn_v.weight3Aqs = util.global.load @"__auto.blk.22.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%537 = torch_c.from_builtin_tensor %__auto.blk.22.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.22.attn_v.q_output3Arscale = util.global.load @"__auto.blk.22.attn_v.q_output:rscale" : tensor<f32> | |
%538 = torch_c.from_builtin_tensor %__auto.blk.22.attn_v.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%539 = torch.vtensor.literal(dense<0.0883883461> : tensor<f32>) : !torch.vtensor<[],f32> | |
%__auto.blk.22.attn_output.q_input3Arscale = util.global.load @"__auto.blk.22.attn_output.q_input:rscale" : tensor<f32> | |
%540 = torch_c.from_builtin_tensor %__auto.blk.22.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.22.attn_output.weight3Aqs = util.global.load @"__auto.blk.22.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%541 = torch_c.from_builtin_tensor %__auto.blk.22.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.22.attn_output.weight3Ad = util.global.load @"__auto.blk.22.attn_output.weight:d" : tensor<f32> | |
%542 = torch_c.from_builtin_tensor %__auto.blk.22.attn_output.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.22.ffn_norm.weight = util.global.load @__auto.blk.22.ffn_norm.weight : tensor<4096xbf16> | |
%543 = torch_c.from_builtin_tensor %__auto.blk.22.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.22.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.22.ffn_gate.q_input:rscale" : tensor<f32> | |
%544 = torch_c.from_builtin_tensor %__auto.blk.22.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.22.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.22.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%545 = torch_c.from_builtin_tensor %__auto.blk.22.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.22.ffn_gate.weight3Ad = util.global.load @"__auto.blk.22.ffn_gate.weight:d" : tensor<f32> | |
%546 = torch_c.from_builtin_tensor %__auto.blk.22.ffn_gate.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.22.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.22.ffn_up.q_input:rscale" : tensor<f32> | |
%547 = torch_c.from_builtin_tensor %__auto.blk.22.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.22.ffn_up.weight3Aqs = util.global.load @"__auto.blk.22.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%548 = torch_c.from_builtin_tensor %__auto.blk.22.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.22.ffn_up.weight3Ad = util.global.load @"__auto.blk.22.ffn_up.weight:d" : tensor<f32> | |
%549 = torch_c.from_builtin_tensor %__auto.blk.22.ffn_up.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.22.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.22.ffn_down.q_input:rscale" : tensor<f32> | |
%550 = torch_c.from_builtin_tensor %__auto.blk.22.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.22.ffn_down.weight3Aqs = util.global.load @"__auto.blk.22.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ> | |
%551 = torch_c.from_builtin_tensor %__auto.blk.22.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ> | |
%__auto.blk.22.ffn_down.weight3Ad = util.global.load @"__auto.blk.22.ffn_down.weight:d" : tensor<f32> | |
%552 = torch_c.from_builtin_tensor %__auto.blk.22.ffn_down.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.23.attn_norm.weight = util.global.load @__auto.blk.23.attn_norm.weight : tensor<4096xbf16> | |
%553 = torch_c.from_builtin_tensor %__auto.blk.23.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.23.attn_q.q_input3Arscale = util.global.load @"__auto.blk.23.attn_q.q_input:rscale" : tensor<f32> | |
%554 = torch_c.from_builtin_tensor %__auto.blk.23.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.23.attn_q.weight3Aqs = util.global.load @"__auto.blk.23.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%555 = torch_c.from_builtin_tensor %__auto.blk.23.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.23.attn_q.q_output3Arscale = util.global.load @"__auto.blk.23.attn_q.q_output:rscale" : tensor<f32> | |
%556 = torch_c.from_builtin_tensor %__auto.blk.23.attn_q.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.23.attn_k.q_input3Arscale = util.global.load @"__auto.blk.23.attn_k.q_input:rscale" : tensor<f32> | |
%557 = torch_c.from_builtin_tensor %__auto.blk.23.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.23.attn_k.weight3Aqs = util.global.load @"__auto.blk.23.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%558 = torch_c.from_builtin_tensor %__auto.blk.23.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.23.attn_k.q_output3Arscale = util.global.load @"__auto.blk.23.attn_k.q_output:rscale" : tensor<f32> | |
%559 = torch_c.from_builtin_tensor %__auto.blk.23.attn_k.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.23.attn_v.q_input3Arscale = util.global.load @"__auto.blk.23.attn_v.q_input:rscale" : tensor<f32> | |
%560 = torch_c.from_builtin_tensor %__auto.blk.23.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.23.attn_v.weight3Aqs = util.global.load @"__auto.blk.23.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%561 = torch_c.from_builtin_tensor %__auto.blk.23.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.23.attn_v.q_output3Arscale = util.global.load @"__auto.blk.23.attn_v.q_output:rscale" : tensor<f32> | |
%562 = torch_c.from_builtin_tensor %__auto.blk.23.attn_v.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%563 = torch.vtensor.literal(dense<0.0883883461> : tensor<f32>) : !torch.vtensor<[],f32> | |
%__auto.blk.23.attn_output.q_input3Arscale = util.global.load @"__auto.blk.23.attn_output.q_input:rscale" : tensor<f32> | |
%564 = torch_c.from_builtin_tensor %__auto.blk.23.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.23.attn_output.weight3Aqs = util.global.load @"__auto.blk.23.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%565 = torch_c.from_builtin_tensor %__auto.blk.23.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.23.attn_output.weight3Ad = util.global.load @"__auto.blk.23.attn_output.weight:d" : tensor<f32> | |
%566 = torch_c.from_builtin_tensor %__auto.blk.23.attn_output.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.23.ffn_norm.weight = util.global.load @__auto.blk.23.ffn_norm.weight : tensor<4096xbf16> | |
%567 = torch_c.from_builtin_tensor %__auto.blk.23.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.23.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.23.ffn_gate.q_input:rscale" : tensor<f32> | |
%568 = torch_c.from_builtin_tensor %__auto.blk.23.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.23.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.23.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%569 = torch_c.from_builtin_tensor %__auto.blk.23.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.23.ffn_gate.weight3Ad = util.global.load @"__auto.blk.23.ffn_gate.weight:d" : tensor<f32> | |
%570 = torch_c.from_builtin_tensor %__auto.blk.23.ffn_gate.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.23.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.23.ffn_up.q_input:rscale" : tensor<f32> | |
%571 = torch_c.from_builtin_tensor %__auto.blk.23.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.23.ffn_up.weight3Aqs = util.global.load @"__auto.blk.23.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%572 = torch_c.from_builtin_tensor %__auto.blk.23.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.23.ffn_up.weight3Ad = util.global.load @"__auto.blk.23.ffn_up.weight:d" : tensor<f32> | |
%573 = torch_c.from_builtin_tensor %__auto.blk.23.ffn_up.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.23.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.23.ffn_down.q_input:rscale" : tensor<f32> | |
%574 = torch_c.from_builtin_tensor %__auto.blk.23.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.23.ffn_down.weight3Aqs = util.global.load @"__auto.blk.23.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ> | |
%575 = torch_c.from_builtin_tensor %__auto.blk.23.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ> | |
%__auto.blk.23.ffn_down.weight3Ad = util.global.load @"__auto.blk.23.ffn_down.weight:d" : tensor<f32> | |
%576 = torch_c.from_builtin_tensor %__auto.blk.23.ffn_down.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.24.attn_norm.weight = util.global.load @__auto.blk.24.attn_norm.weight : tensor<4096xbf16> | |
%577 = torch_c.from_builtin_tensor %__auto.blk.24.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.24.attn_q.q_input3Arscale = util.global.load @"__auto.blk.24.attn_q.q_input:rscale" : tensor<f32> | |
%578 = torch_c.from_builtin_tensor %__auto.blk.24.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.24.attn_q.weight3Aqs = util.global.load @"__auto.blk.24.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%579 = torch_c.from_builtin_tensor %__auto.blk.24.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.24.attn_q.q_output3Arscale = util.global.load @"__auto.blk.24.attn_q.q_output:rscale" : tensor<f32> | |
%580 = torch_c.from_builtin_tensor %__auto.blk.24.attn_q.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.24.attn_k.q_input3Arscale = util.global.load @"__auto.blk.24.attn_k.q_input:rscale" : tensor<f32> | |
%581 = torch_c.from_builtin_tensor %__auto.blk.24.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.24.attn_k.weight3Aqs = util.global.load @"__auto.blk.24.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%582 = torch_c.from_builtin_tensor %__auto.blk.24.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.24.attn_k.q_output3Arscale = util.global.load @"__auto.blk.24.attn_k.q_output:rscale" : tensor<f32> | |
%583 = torch_c.from_builtin_tensor %__auto.blk.24.attn_k.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.24.attn_v.q_input3Arscale = util.global.load @"__auto.blk.24.attn_v.q_input:rscale" : tensor<f32> | |
%584 = torch_c.from_builtin_tensor %__auto.blk.24.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.24.attn_v.weight3Aqs = util.global.load @"__auto.blk.24.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%585 = torch_c.from_builtin_tensor %__auto.blk.24.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.24.attn_v.q_output3Arscale = util.global.load @"__auto.blk.24.attn_v.q_output:rscale" : tensor<f32> | |
%586 = torch_c.from_builtin_tensor %__auto.blk.24.attn_v.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%587 = torch.vtensor.literal(dense<0.0883883461> : tensor<f32>) : !torch.vtensor<[],f32> | |
%__auto.blk.24.attn_output.q_input3Arscale = util.global.load @"__auto.blk.24.attn_output.q_input:rscale" : tensor<f32> | |
%588 = torch_c.from_builtin_tensor %__auto.blk.24.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.24.attn_output.weight3Aqs = util.global.load @"__auto.blk.24.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%589 = torch_c.from_builtin_tensor %__auto.blk.24.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.24.attn_output.weight3Ad = util.global.load @"__auto.blk.24.attn_output.weight:d" : tensor<f32> | |
%590 = torch_c.from_builtin_tensor %__auto.blk.24.attn_output.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.24.ffn_norm.weight = util.global.load @__auto.blk.24.ffn_norm.weight : tensor<4096xbf16> | |
%591 = torch_c.from_builtin_tensor %__auto.blk.24.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.24.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.24.ffn_gate.q_input:rscale" : tensor<f32> | |
%592 = torch_c.from_builtin_tensor %__auto.blk.24.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.24.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.24.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%593 = torch_c.from_builtin_tensor %__auto.blk.24.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.24.ffn_gate.weight3Ad = util.global.load @"__auto.blk.24.ffn_gate.weight:d" : tensor<f32> | |
%594 = torch_c.from_builtin_tensor %__auto.blk.24.ffn_gate.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.24.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.24.ffn_up.q_input:rscale" : tensor<f32> | |
%595 = torch_c.from_builtin_tensor %__auto.blk.24.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.24.ffn_up.weight3Aqs = util.global.load @"__auto.blk.24.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%596 = torch_c.from_builtin_tensor %__auto.blk.24.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.24.ffn_up.weight3Ad = util.global.load @"__auto.blk.24.ffn_up.weight:d" : tensor<f32> | |
%597 = torch_c.from_builtin_tensor %__auto.blk.24.ffn_up.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.24.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.24.ffn_down.q_input:rscale" : tensor<f32> | |
%598 = torch_c.from_builtin_tensor %__auto.blk.24.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.24.ffn_down.weight3Aqs = util.global.load @"__auto.blk.24.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ> | |
%599 = torch_c.from_builtin_tensor %__auto.blk.24.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ> | |
%__auto.blk.24.ffn_down.weight3Ad = util.global.load @"__auto.blk.24.ffn_down.weight:d" : tensor<f32> | |
%600 = torch_c.from_builtin_tensor %__auto.blk.24.ffn_down.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.25.attn_norm.weight = util.global.load @__auto.blk.25.attn_norm.weight : tensor<4096xbf16> | |
%601 = torch_c.from_builtin_tensor %__auto.blk.25.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.25.attn_q.q_input3Arscale = util.global.load @"__auto.blk.25.attn_q.q_input:rscale" : tensor<f32> | |
%602 = torch_c.from_builtin_tensor %__auto.blk.25.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.25.attn_q.weight3Aqs = util.global.load @"__auto.blk.25.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%603 = torch_c.from_builtin_tensor %__auto.blk.25.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.25.attn_q.q_output3Arscale = util.global.load @"__auto.blk.25.attn_q.q_output:rscale" : tensor<f32> | |
%604 = torch_c.from_builtin_tensor %__auto.blk.25.attn_q.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.25.attn_k.q_input3Arscale = util.global.load @"__auto.blk.25.attn_k.q_input:rscale" : tensor<f32> | |
%605 = torch_c.from_builtin_tensor %__auto.blk.25.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.25.attn_k.weight3Aqs = util.global.load @"__auto.blk.25.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%606 = torch_c.from_builtin_tensor %__auto.blk.25.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.25.attn_k.q_output3Arscale = util.global.load @"__auto.blk.25.attn_k.q_output:rscale" : tensor<f32> | |
%607 = torch_c.from_builtin_tensor %__auto.blk.25.attn_k.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.25.attn_v.q_input3Arscale = util.global.load @"__auto.blk.25.attn_v.q_input:rscale" : tensor<f32> | |
%608 = torch_c.from_builtin_tensor %__auto.blk.25.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.25.attn_v.weight3Aqs = util.global.load @"__auto.blk.25.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%609 = torch_c.from_builtin_tensor %__auto.blk.25.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.25.attn_v.q_output3Arscale = util.global.load @"__auto.blk.25.attn_v.q_output:rscale" : tensor<f32> | |
%610 = torch_c.from_builtin_tensor %__auto.blk.25.attn_v.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%611 = torch.vtensor.literal(dense<0.0883883461> : tensor<f32>) : !torch.vtensor<[],f32> | |
%__auto.blk.25.attn_output.q_input3Arscale = util.global.load @"__auto.blk.25.attn_output.q_input:rscale" : tensor<f32> | |
%612 = torch_c.from_builtin_tensor %__auto.blk.25.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.25.attn_output.weight3Aqs = util.global.load @"__auto.blk.25.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%613 = torch_c.from_builtin_tensor %__auto.blk.25.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.25.attn_output.weight3Ad = util.global.load @"__auto.blk.25.attn_output.weight:d" : tensor<f32> | |
%614 = torch_c.from_builtin_tensor %__auto.blk.25.attn_output.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.25.ffn_norm.weight = util.global.load @__auto.blk.25.ffn_norm.weight : tensor<4096xbf16> | |
%615 = torch_c.from_builtin_tensor %__auto.blk.25.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.25.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.25.ffn_gate.q_input:rscale" : tensor<f32> | |
%616 = torch_c.from_builtin_tensor %__auto.blk.25.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.25.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.25.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%617 = torch_c.from_builtin_tensor %__auto.blk.25.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.25.ffn_gate.weight3Ad = util.global.load @"__auto.blk.25.ffn_gate.weight:d" : tensor<f32> | |
%618 = torch_c.from_builtin_tensor %__auto.blk.25.ffn_gate.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.25.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.25.ffn_up.q_input:rscale" : tensor<f32> | |
%619 = torch_c.from_builtin_tensor %__auto.blk.25.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.25.ffn_up.weight3Aqs = util.global.load @"__auto.blk.25.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%620 = torch_c.from_builtin_tensor %__auto.blk.25.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.25.ffn_up.weight3Ad = util.global.load @"__auto.blk.25.ffn_up.weight:d" : tensor<f32> | |
%621 = torch_c.from_builtin_tensor %__auto.blk.25.ffn_up.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.25.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.25.ffn_down.q_input:rscale" : tensor<f32> | |
%622 = torch_c.from_builtin_tensor %__auto.blk.25.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.25.ffn_down.weight3Aqs = util.global.load @"__auto.blk.25.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ> | |
%623 = torch_c.from_builtin_tensor %__auto.blk.25.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ> | |
%__auto.blk.25.ffn_down.weight3Ad = util.global.load @"__auto.blk.25.ffn_down.weight:d" : tensor<f32> | |
%624 = torch_c.from_builtin_tensor %__auto.blk.25.ffn_down.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.26.attn_norm.weight = util.global.load @__auto.blk.26.attn_norm.weight : tensor<4096xbf16> | |
%625 = torch_c.from_builtin_tensor %__auto.blk.26.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.26.attn_q.q_input3Arscale = util.global.load @"__auto.blk.26.attn_q.q_input:rscale" : tensor<f32> | |
%626 = torch_c.from_builtin_tensor %__auto.blk.26.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.26.attn_q.weight3Aqs = util.global.load @"__auto.blk.26.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%627 = torch_c.from_builtin_tensor %__auto.blk.26.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.26.attn_q.q_output3Arscale = util.global.load @"__auto.blk.26.attn_q.q_output:rscale" : tensor<f32> | |
%628 = torch_c.from_builtin_tensor %__auto.blk.26.attn_q.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.26.attn_k.q_input3Arscale = util.global.load @"__auto.blk.26.attn_k.q_input:rscale" : tensor<f32> | |
%629 = torch_c.from_builtin_tensor %__auto.blk.26.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.26.attn_k.weight3Aqs = util.global.load @"__auto.blk.26.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%630 = torch_c.from_builtin_tensor %__auto.blk.26.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.26.attn_k.q_output3Arscale = util.global.load @"__auto.blk.26.attn_k.q_output:rscale" : tensor<f32> | |
%631 = torch_c.from_builtin_tensor %__auto.blk.26.attn_k.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.26.attn_v.q_input3Arscale = util.global.load @"__auto.blk.26.attn_v.q_input:rscale" : tensor<f32> | |
%632 = torch_c.from_builtin_tensor %__auto.blk.26.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.26.attn_v.weight3Aqs = util.global.load @"__auto.blk.26.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%633 = torch_c.from_builtin_tensor %__auto.blk.26.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.26.attn_v.q_output3Arscale = util.global.load @"__auto.blk.26.attn_v.q_output:rscale" : tensor<f32> | |
%634 = torch_c.from_builtin_tensor %__auto.blk.26.attn_v.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%635 = torch.vtensor.literal(dense<0.0883883461> : tensor<f32>) : !torch.vtensor<[],f32> | |
%__auto.blk.26.attn_output.q_input3Arscale = util.global.load @"__auto.blk.26.attn_output.q_input:rscale" : tensor<f32> | |
%636 = torch_c.from_builtin_tensor %__auto.blk.26.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.26.attn_output.weight3Aqs = util.global.load @"__auto.blk.26.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%637 = torch_c.from_builtin_tensor %__auto.blk.26.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.26.attn_output.weight3Ad = util.global.load @"__auto.blk.26.attn_output.weight:d" : tensor<f32> | |
%638 = torch_c.from_builtin_tensor %__auto.blk.26.attn_output.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.26.ffn_norm.weight = util.global.load @__auto.blk.26.ffn_norm.weight : tensor<4096xbf16> | |
%639 = torch_c.from_builtin_tensor %__auto.blk.26.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.26.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.26.ffn_gate.q_input:rscale" : tensor<f32> | |
%640 = torch_c.from_builtin_tensor %__auto.blk.26.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.26.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.26.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%641 = torch_c.from_builtin_tensor %__auto.blk.26.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.26.ffn_gate.weight3Ad = util.global.load @"__auto.blk.26.ffn_gate.weight:d" : tensor<f32> | |
%642 = torch_c.from_builtin_tensor %__auto.blk.26.ffn_gate.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.26.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.26.ffn_up.q_input:rscale" : tensor<f32> | |
%643 = torch_c.from_builtin_tensor %__auto.blk.26.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.26.ffn_up.weight3Aqs = util.global.load @"__auto.blk.26.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%644 = torch_c.from_builtin_tensor %__auto.blk.26.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.26.ffn_up.weight3Ad = util.global.load @"__auto.blk.26.ffn_up.weight:d" : tensor<f32> | |
%645 = torch_c.from_builtin_tensor %__auto.blk.26.ffn_up.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.26.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.26.ffn_down.q_input:rscale" : tensor<f32> | |
%646 = torch_c.from_builtin_tensor %__auto.blk.26.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.26.ffn_down.weight3Aqs = util.global.load @"__auto.blk.26.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ> | |
%647 = torch_c.from_builtin_tensor %__auto.blk.26.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ> | |
%__auto.blk.26.ffn_down.weight3Ad = util.global.load @"__auto.blk.26.ffn_down.weight:d" : tensor<f32> | |
%648 = torch_c.from_builtin_tensor %__auto.blk.26.ffn_down.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.27.attn_norm.weight = util.global.load @__auto.blk.27.attn_norm.weight : tensor<4096xbf16> | |
%649 = torch_c.from_builtin_tensor %__auto.blk.27.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.27.attn_q.q_input3Arscale = util.global.load @"__auto.blk.27.attn_q.q_input:rscale" : tensor<f32> | |
%650 = torch_c.from_builtin_tensor %__auto.blk.27.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.27.attn_q.weight3Aqs = util.global.load @"__auto.blk.27.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%651 = torch_c.from_builtin_tensor %__auto.blk.27.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.27.attn_q.q_output3Arscale = util.global.load @"__auto.blk.27.attn_q.q_output:rscale" : tensor<f32> | |
%652 = torch_c.from_builtin_tensor %__auto.blk.27.attn_q.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.27.attn_k.q_input3Arscale = util.global.load @"__auto.blk.27.attn_k.q_input:rscale" : tensor<f32> | |
%653 = torch_c.from_builtin_tensor %__auto.blk.27.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.27.attn_k.weight3Aqs = util.global.load @"__auto.blk.27.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%654 = torch_c.from_builtin_tensor %__auto.blk.27.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.27.attn_k.q_output3Arscale = util.global.load @"__auto.blk.27.attn_k.q_output:rscale" : tensor<f32> | |
%655 = torch_c.from_builtin_tensor %__auto.blk.27.attn_k.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.27.attn_v.q_input3Arscale = util.global.load @"__auto.blk.27.attn_v.q_input:rscale" : tensor<f32> | |
%656 = torch_c.from_builtin_tensor %__auto.blk.27.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.27.attn_v.weight3Aqs = util.global.load @"__auto.blk.27.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%657 = torch_c.from_builtin_tensor %__auto.blk.27.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.27.attn_v.q_output3Arscale = util.global.load @"__auto.blk.27.attn_v.q_output:rscale" : tensor<f32> | |
%658 = torch_c.from_builtin_tensor %__auto.blk.27.attn_v.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%659 = torch.vtensor.literal(dense<0.0883883461> : tensor<f32>) : !torch.vtensor<[],f32> | |
%__auto.blk.27.attn_output.q_input3Arscale = util.global.load @"__auto.blk.27.attn_output.q_input:rscale" : tensor<f32> | |
%660 = torch_c.from_builtin_tensor %__auto.blk.27.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.27.attn_output.weight3Aqs = util.global.load @"__auto.blk.27.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%661 = torch_c.from_builtin_tensor %__auto.blk.27.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.27.attn_output.weight3Ad = util.global.load @"__auto.blk.27.attn_output.weight:d" : tensor<f32> | |
%662 = torch_c.from_builtin_tensor %__auto.blk.27.attn_output.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.27.ffn_norm.weight = util.global.load @__auto.blk.27.ffn_norm.weight : tensor<4096xbf16> | |
%663 = torch_c.from_builtin_tensor %__auto.blk.27.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.27.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.27.ffn_gate.q_input:rscale" : tensor<f32> | |
%664 = torch_c.from_builtin_tensor %__auto.blk.27.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.27.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.27.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%665 = torch_c.from_builtin_tensor %__auto.blk.27.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.27.ffn_gate.weight3Ad = util.global.load @"__auto.blk.27.ffn_gate.weight:d" : tensor<f32> | |
%666 = torch_c.from_builtin_tensor %__auto.blk.27.ffn_gate.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.27.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.27.ffn_up.q_input:rscale" : tensor<f32> | |
%667 = torch_c.from_builtin_tensor %__auto.blk.27.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.27.ffn_up.weight3Aqs = util.global.load @"__auto.blk.27.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%668 = torch_c.from_builtin_tensor %__auto.blk.27.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.27.ffn_up.weight3Ad = util.global.load @"__auto.blk.27.ffn_up.weight:d" : tensor<f32> | |
%669 = torch_c.from_builtin_tensor %__auto.blk.27.ffn_up.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.27.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.27.ffn_down.q_input:rscale" : tensor<f32> | |
%670 = torch_c.from_builtin_tensor %__auto.blk.27.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.27.ffn_down.weight3Aqs = util.global.load @"__auto.blk.27.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ> | |
%671 = torch_c.from_builtin_tensor %__auto.blk.27.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ> | |
%__auto.blk.27.ffn_down.weight3Ad = util.global.load @"__auto.blk.27.ffn_down.weight:d" : tensor<f32> | |
%672 = torch_c.from_builtin_tensor %__auto.blk.27.ffn_down.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.28.attn_norm.weight = util.global.load @__auto.blk.28.attn_norm.weight : tensor<4096xbf16> | |
%673 = torch_c.from_builtin_tensor %__auto.blk.28.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.28.attn_q.q_input3Arscale = util.global.load @"__auto.blk.28.attn_q.q_input:rscale" : tensor<f32> | |
%674 = torch_c.from_builtin_tensor %__auto.blk.28.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.28.attn_q.weight3Aqs = util.global.load @"__auto.blk.28.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%675 = torch_c.from_builtin_tensor %__auto.blk.28.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.28.attn_q.q_output3Arscale = util.global.load @"__auto.blk.28.attn_q.q_output:rscale" : tensor<f32> | |
%676 = torch_c.from_builtin_tensor %__auto.blk.28.attn_q.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.28.attn_k.q_input3Arscale = util.global.load @"__auto.blk.28.attn_k.q_input:rscale" : tensor<f32> | |
%677 = torch_c.from_builtin_tensor %__auto.blk.28.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.28.attn_k.weight3Aqs = util.global.load @"__auto.blk.28.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%678 = torch_c.from_builtin_tensor %__auto.blk.28.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.28.attn_k.q_output3Arscale = util.global.load @"__auto.blk.28.attn_k.q_output:rscale" : tensor<f32> | |
%679 = torch_c.from_builtin_tensor %__auto.blk.28.attn_k.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.28.attn_v.q_input3Arscale = util.global.load @"__auto.blk.28.attn_v.q_input:rscale" : tensor<f32> | |
%680 = torch_c.from_builtin_tensor %__auto.blk.28.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.28.attn_v.weight3Aqs = util.global.load @"__auto.blk.28.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%681 = torch_c.from_builtin_tensor %__auto.blk.28.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.28.attn_v.q_output3Arscale = util.global.load @"__auto.blk.28.attn_v.q_output:rscale" : tensor<f32> | |
%682 = torch_c.from_builtin_tensor %__auto.blk.28.attn_v.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%683 = torch.vtensor.literal(dense<0.0883883461> : tensor<f32>) : !torch.vtensor<[],f32> | |
%__auto.blk.28.attn_output.q_input3Arscale = util.global.load @"__auto.blk.28.attn_output.q_input:rscale" : tensor<f32> | |
%684 = torch_c.from_builtin_tensor %__auto.blk.28.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.28.attn_output.weight3Aqs = util.global.load @"__auto.blk.28.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%685 = torch_c.from_builtin_tensor %__auto.blk.28.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.28.attn_output.weight3Ad = util.global.load @"__auto.blk.28.attn_output.weight:d" : tensor<f32> | |
%686 = torch_c.from_builtin_tensor %__auto.blk.28.attn_output.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.28.ffn_norm.weight = util.global.load @__auto.blk.28.ffn_norm.weight : tensor<4096xbf16> | |
%687 = torch_c.from_builtin_tensor %__auto.blk.28.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.28.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.28.ffn_gate.q_input:rscale" : tensor<f32> | |
%688 = torch_c.from_builtin_tensor %__auto.blk.28.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.28.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.28.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%689 = torch_c.from_builtin_tensor %__auto.blk.28.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.28.ffn_gate.weight3Ad = util.global.load @"__auto.blk.28.ffn_gate.weight:d" : tensor<f32> | |
%690 = torch_c.from_builtin_tensor %__auto.blk.28.ffn_gate.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.28.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.28.ffn_up.q_input:rscale" : tensor<f32> | |
%691 = torch_c.from_builtin_tensor %__auto.blk.28.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.28.ffn_up.weight3Aqs = util.global.load @"__auto.blk.28.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%692 = torch_c.from_builtin_tensor %__auto.blk.28.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.28.ffn_up.weight3Ad = util.global.load @"__auto.blk.28.ffn_up.weight:d" : tensor<f32> | |
%693 = torch_c.from_builtin_tensor %__auto.blk.28.ffn_up.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.28.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.28.ffn_down.q_input:rscale" : tensor<f32> | |
%694 = torch_c.from_builtin_tensor %__auto.blk.28.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.28.ffn_down.weight3Aqs = util.global.load @"__auto.blk.28.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ> | |
%695 = torch_c.from_builtin_tensor %__auto.blk.28.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ> | |
%__auto.blk.28.ffn_down.weight3Ad = util.global.load @"__auto.blk.28.ffn_down.weight:d" : tensor<f32> | |
%696 = torch_c.from_builtin_tensor %__auto.blk.28.ffn_down.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.29.attn_norm.weight = util.global.load @__auto.blk.29.attn_norm.weight : tensor<4096xbf16> | |
%697 = torch_c.from_builtin_tensor %__auto.blk.29.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.29.attn_q.q_input3Arscale = util.global.load @"__auto.blk.29.attn_q.q_input:rscale" : tensor<f32> | |
%698 = torch_c.from_builtin_tensor %__auto.blk.29.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.29.attn_q.weight3Aqs = util.global.load @"__auto.blk.29.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%699 = torch_c.from_builtin_tensor %__auto.blk.29.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.29.attn_q.q_output3Arscale = util.global.load @"__auto.blk.29.attn_q.q_output:rscale" : tensor<f32> | |
%700 = torch_c.from_builtin_tensor %__auto.blk.29.attn_q.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.29.attn_k.q_input3Arscale = util.global.load @"__auto.blk.29.attn_k.q_input:rscale" : tensor<f32> | |
%701 = torch_c.from_builtin_tensor %__auto.blk.29.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.29.attn_k.weight3Aqs = util.global.load @"__auto.blk.29.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%702 = torch_c.from_builtin_tensor %__auto.blk.29.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.29.attn_k.q_output3Arscale = util.global.load @"__auto.blk.29.attn_k.q_output:rscale" : tensor<f32> | |
%703 = torch_c.from_builtin_tensor %__auto.blk.29.attn_k.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.29.attn_v.q_input3Arscale = util.global.load @"__auto.blk.29.attn_v.q_input:rscale" : tensor<f32> | |
%704 = torch_c.from_builtin_tensor %__auto.blk.29.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.29.attn_v.weight3Aqs = util.global.load @"__auto.blk.29.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%705 = torch_c.from_builtin_tensor %__auto.blk.29.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.29.attn_v.q_output3Arscale = util.global.load @"__auto.blk.29.attn_v.q_output:rscale" : tensor<f32> | |
%706 = torch_c.from_builtin_tensor %__auto.blk.29.attn_v.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%707 = torch.vtensor.literal(dense<0.0883883461> : tensor<f32>) : !torch.vtensor<[],f32> | |
%__auto.blk.29.attn_output.q_input3Arscale = util.global.load @"__auto.blk.29.attn_output.q_input:rscale" : tensor<f32> | |
%708 = torch_c.from_builtin_tensor %__auto.blk.29.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.29.attn_output.weight3Aqs = util.global.load @"__auto.blk.29.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%709 = torch_c.from_builtin_tensor %__auto.blk.29.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.29.attn_output.weight3Ad = util.global.load @"__auto.blk.29.attn_output.weight:d" : tensor<f32> | |
%710 = torch_c.from_builtin_tensor %__auto.blk.29.attn_output.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.29.ffn_norm.weight = util.global.load @__auto.blk.29.ffn_norm.weight : tensor<4096xbf16> | |
%711 = torch_c.from_builtin_tensor %__auto.blk.29.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.29.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.29.ffn_gate.q_input:rscale" : tensor<f32> | |
%712 = torch_c.from_builtin_tensor %__auto.blk.29.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.29.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.29.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%713 = torch_c.from_builtin_tensor %__auto.blk.29.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.29.ffn_gate.weight3Ad = util.global.load @"__auto.blk.29.ffn_gate.weight:d" : tensor<f32> | |
%714 = torch_c.from_builtin_tensor %__auto.blk.29.ffn_gate.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.29.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.29.ffn_up.q_input:rscale" : tensor<f32> | |
%715 = torch_c.from_builtin_tensor %__auto.blk.29.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.29.ffn_up.weight3Aqs = util.global.load @"__auto.blk.29.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%716 = torch_c.from_builtin_tensor %__auto.blk.29.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.29.ffn_up.weight3Ad = util.global.load @"__auto.blk.29.ffn_up.weight:d" : tensor<f32> | |
%717 = torch_c.from_builtin_tensor %__auto.blk.29.ffn_up.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.29.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.29.ffn_down.q_input:rscale" : tensor<f32> | |
%718 = torch_c.from_builtin_tensor %__auto.blk.29.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.29.ffn_down.weight3Aqs = util.global.load @"__auto.blk.29.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ> | |
%719 = torch_c.from_builtin_tensor %__auto.blk.29.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ> | |
%__auto.blk.29.ffn_down.weight3Ad = util.global.load @"__auto.blk.29.ffn_down.weight:d" : tensor<f32> | |
%720 = torch_c.from_builtin_tensor %__auto.blk.29.ffn_down.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.30.attn_norm.weight = util.global.load @__auto.blk.30.attn_norm.weight : tensor<4096xbf16> | |
%721 = torch_c.from_builtin_tensor %__auto.blk.30.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.30.attn_q.q_input3Arscale = util.global.load @"__auto.blk.30.attn_q.q_input:rscale" : tensor<f32> | |
%722 = torch_c.from_builtin_tensor %__auto.blk.30.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.30.attn_q.weight3Aqs = util.global.load @"__auto.blk.30.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%723 = torch_c.from_builtin_tensor %__auto.blk.30.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.30.attn_q.q_output3Arscale = util.global.load @"__auto.blk.30.attn_q.q_output:rscale" : tensor<f32> | |
%724 = torch_c.from_builtin_tensor %__auto.blk.30.attn_q.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.30.attn_k.q_input3Arscale = util.global.load @"__auto.blk.30.attn_k.q_input:rscale" : tensor<f32> | |
%725 = torch_c.from_builtin_tensor %__auto.blk.30.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.30.attn_k.weight3Aqs = util.global.load @"__auto.blk.30.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%726 = torch_c.from_builtin_tensor %__auto.blk.30.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.30.attn_k.q_output3Arscale = util.global.load @"__auto.blk.30.attn_k.q_output:rscale" : tensor<f32> | |
%727 = torch_c.from_builtin_tensor %__auto.blk.30.attn_k.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.30.attn_v.q_input3Arscale = util.global.load @"__auto.blk.30.attn_v.q_input:rscale" : tensor<f32> | |
%728 = torch_c.from_builtin_tensor %__auto.blk.30.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.30.attn_v.weight3Aqs = util.global.load @"__auto.blk.30.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%729 = torch_c.from_builtin_tensor %__auto.blk.30.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.30.attn_v.q_output3Arscale = util.global.load @"__auto.blk.30.attn_v.q_output:rscale" : tensor<f32> | |
%730 = torch_c.from_builtin_tensor %__auto.blk.30.attn_v.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%731 = torch.vtensor.literal(dense<0.0883883461> : tensor<f32>) : !torch.vtensor<[],f32> | |
%__auto.blk.30.attn_output.q_input3Arscale = util.global.load @"__auto.blk.30.attn_output.q_input:rscale" : tensor<f32> | |
%732 = torch_c.from_builtin_tensor %__auto.blk.30.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.30.attn_output.weight3Aqs = util.global.load @"__auto.blk.30.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%733 = torch_c.from_builtin_tensor %__auto.blk.30.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.30.attn_output.weight3Ad = util.global.load @"__auto.blk.30.attn_output.weight:d" : tensor<f32> | |
%734 = torch_c.from_builtin_tensor %__auto.blk.30.attn_output.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.30.ffn_norm.weight = util.global.load @__auto.blk.30.ffn_norm.weight : tensor<4096xbf16> | |
%735 = torch_c.from_builtin_tensor %__auto.blk.30.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.30.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.30.ffn_gate.q_input:rscale" : tensor<f32> | |
%736 = torch_c.from_builtin_tensor %__auto.blk.30.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.30.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.30.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%737 = torch_c.from_builtin_tensor %__auto.blk.30.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.30.ffn_gate.weight3Ad = util.global.load @"__auto.blk.30.ffn_gate.weight:d" : tensor<f32> | |
%738 = torch_c.from_builtin_tensor %__auto.blk.30.ffn_gate.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.30.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.30.ffn_up.q_input:rscale" : tensor<f32> | |
%739 = torch_c.from_builtin_tensor %__auto.blk.30.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.30.ffn_up.weight3Aqs = util.global.load @"__auto.blk.30.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%740 = torch_c.from_builtin_tensor %__auto.blk.30.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.30.ffn_up.weight3Ad = util.global.load @"__auto.blk.30.ffn_up.weight:d" : tensor<f32> | |
%741 = torch_c.from_builtin_tensor %__auto.blk.30.ffn_up.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.30.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.30.ffn_down.q_input:rscale" : tensor<f32> | |
%742 = torch_c.from_builtin_tensor %__auto.blk.30.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.30.ffn_down.weight3Aqs = util.global.load @"__auto.blk.30.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ> | |
%743 = torch_c.from_builtin_tensor %__auto.blk.30.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ> | |
%__auto.blk.30.ffn_down.weight3Ad = util.global.load @"__auto.blk.30.ffn_down.weight:d" : tensor<f32> | |
%744 = torch_c.from_builtin_tensor %__auto.blk.30.ffn_down.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
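// blk.31 below appears to be the last decoder block whose parameters are loaded in this prologue;
// with blocks numbered from 0 this would be a 32-layer model, consistent with the 4096 hidden size
// and 14336 FFN width seen in the shapes above (an assumption inferred from the IR, not stated in it).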
%__auto.blk.31.attn_norm.weight = util.global.load @__auto.blk.31.attn_norm.weight : tensor<4096xbf16> | |
%745 = torch_c.from_builtin_tensor %__auto.blk.31.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.31.attn_q.q_input3Arscale = util.global.load @"__auto.blk.31.attn_q.q_input:rscale" : tensor<f32> | |
%746 = torch_c.from_builtin_tensor %__auto.blk.31.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.31.attn_q.weight3Aqs = util.global.load @"__auto.blk.31.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%747 = torch_c.from_builtin_tensor %__auto.blk.31.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.31.attn_q.q_output3Arscale = util.global.load @"__auto.blk.31.attn_q.q_output:rscale" : tensor<f32> | |
%748 = torch_c.from_builtin_tensor %__auto.blk.31.attn_q.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.31.attn_k.q_input3Arscale = util.global.load @"__auto.blk.31.attn_k.q_input:rscale" : tensor<f32> | |
%749 = torch_c.from_builtin_tensor %__auto.blk.31.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.31.attn_k.weight3Aqs = util.global.load @"__auto.blk.31.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%750 = torch_c.from_builtin_tensor %__auto.blk.31.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.31.attn_k.q_output3Arscale = util.global.load @"__auto.blk.31.attn_k.q_output:rscale" : tensor<f32> | |
%751 = torch_c.from_builtin_tensor %__auto.blk.31.attn_k.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.31.attn_v.q_input3Arscale = util.global.load @"__auto.blk.31.attn_v.q_input:rscale" : tensor<f32> | |
%752 = torch_c.from_builtin_tensor %__auto.blk.31.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.31.attn_v.weight3Aqs = util.global.load @"__auto.blk.31.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%753 = torch_c.from_builtin_tensor %__auto.blk.31.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.31.attn_v.q_output3Arscale = util.global.load @"__auto.blk.31.attn_v.q_output:rscale" : tensor<f32> | |
%754 = torch_c.from_builtin_tensor %__auto.blk.31.attn_v.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%755 = torch.vtensor.literal(dense<0.0883883461> : tensor<f32>) : !torch.vtensor<[],f32> | |
%__auto.blk.31.attn_output.q_input3Arscale = util.global.load @"__auto.blk.31.attn_output.q_input:rscale" : tensor<f32> | |
%756 = torch_c.from_builtin_tensor %__auto.blk.31.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.31.attn_output.weight3Aqs = util.global.load @"__auto.blk.31.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%757 = torch_c.from_builtin_tensor %__auto.blk.31.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.31.attn_output.weight3Ad = util.global.load @"__auto.blk.31.attn_output.weight:d" : tensor<f32> | |
%758 = torch_c.from_builtin_tensor %__auto.blk.31.attn_output.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.31.ffn_norm.weight = util.global.load @__auto.blk.31.ffn_norm.weight : tensor<4096xbf16> | |
%759 = torch_c.from_builtin_tensor %__auto.blk.31.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.31.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.31.ffn_gate.q_input:rscale" : tensor<f32> | |
%760 = torch_c.from_builtin_tensor %__auto.blk.31.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.31.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.31.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%761 = torch_c.from_builtin_tensor %__auto.blk.31.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.31.ffn_gate.weight3Ad = util.global.load @"__auto.blk.31.ffn_gate.weight:d" : tensor<f32> | |
%762 = torch_c.from_builtin_tensor %__auto.blk.31.ffn_gate.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.31.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.31.ffn_up.q_input:rscale" : tensor<f32> | |
%763 = torch_c.from_builtin_tensor %__auto.blk.31.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.31.ffn_up.weight3Aqs = util.global.load @"__auto.blk.31.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%764 = torch_c.from_builtin_tensor %__auto.blk.31.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.31.ffn_up.weight3Ad = util.global.load @"__auto.blk.31.ffn_up.weight:d" : tensor<f32> | |
%765 = torch_c.from_builtin_tensor %__auto.blk.31.ffn_up.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.31.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.31.ffn_down.q_input:rscale" : tensor<f32> | |
%766 = torch_c.from_builtin_tensor %__auto.blk.31.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.31.ffn_down.weight3Aqs = util.global.load @"__auto.blk.31.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ> | |
%767 = torch_c.from_builtin_tensor %__auto.blk.31.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ> | |
%__auto.blk.31.ffn_down.weight3Ad = util.global.load @"__auto.blk.31.ffn_down.weight:d" : tensor<f32> | |
%768 = torch_c.from_builtin_tensor %__auto.blk.31.ffn_down.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.output_norm.weight = util.global.load @__auto.output_norm.weight : tensor<4096xbf16> | |
%769 = torch_c.from_builtin_tensor %__auto.output_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.output.weight = util.global.load @__auto.output.weight : tensor<128256x4096xbf16> | |
%770 = torch_c.from_builtin_tensor %__auto.output.weight : tensor<128256x4096xbf16> -> !torch.vtensor<[128256,4096],bf16> | |
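// End of parameter loads; the prefill body begins below. The arguments appear to be:
// %arg0 = token ids [4, seq_len], %arg1 = per-sequence lengths [4],
// %arg2 = KV-cache page ids [4, ?], %arg3 = flat paged KV cache [?, 2097152] in f8E4M3FNUZ.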
%771 = torch.copy.to_vtensor %arg3 : !torch.vtensor<[?,2097152],f8E4M3FNUZ> | |
%772 = torch.symbolic_int "32*s1" {min_val = 64, max_val = 131040} : !torch.int | |
%773 = torch.symbolic_int "s1" {min_val = 2, max_val = 4095} : !torch.int | |
%774 = torch.symbolic_int "s2" {min_val = 0, max_val = 9223372036854775807} : !torch.int | |
torch.bind_symbolic_shape %arg0, [%773], affine_map<()[s0] -> (4, s0 * 32)> : !torch.vtensor<[4,?],si64> | |
torch.bind_symbolic_shape %arg2, [%773], affine_map<()[s0] -> (4, s0)> : !torch.vtensor<[4,?],si64> | |
torch.bind_symbolic_shape %771, [%774], affine_map<()[s0] -> (s0, 2097152)> : !torch.vtensor<[?,2097152],f8E4M3FNUZ> | |
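// Symbolic dims: s1 is the sequence length in 32-token blocks (seq_len = 32 * s1, at most 131040)
// and s2 is the number of rows (pages) in the KV cache. The arange below builds position ids [0, seq_len).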
%int1 = torch.constant.int 1 | |
%775 = torch.aten.size.int %arg2, %int1 : !torch.vtensor<[4,?],si64>, !torch.int -> !torch.int | |
%int0 = torch.constant.int 0 | |
%776 = torch.aten.size.int %771, %int0 : !torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int -> !torch.int | |
%int1_0 = torch.constant.int 1 | |
%777 = torch.aten.size.int %arg0, %int1_0 : !torch.vtensor<[4,?],si64>, !torch.int -> !torch.int | |
%int0_1 = torch.constant.int 0 | |
%int1_2 = torch.constant.int 1 | |
%none = torch.constant.none | |
%none_3 = torch.constant.none | |
%cpu = torch.constant.device "cpu" | |
%false = torch.constant.bool false | |
%778 = torch.aten.arange.start_step %int0_1, %777, %int1_2, %none, %none_3, %cpu, %false : !torch.int, !torch.int, !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[?],si64> | |
torch.bind_symbolic_shape %778, [%773], affine_map<()[s0] -> (s0 * 32)> : !torch.vtensor<[?],si64> | |
%int-1 = torch.constant.int -1 | |
%779 = torch.aten.unsqueeze %arg1, %int-1 : !torch.vtensor<[4],si64>, !torch.int -> !torch.vtensor<[4,1],si64> | |
%780 = torch.aten.ge.Tensor %778, %779 : !torch.vtensor<[?],si64>, !torch.vtensor<[4,1],si64> -> !torch.vtensor<[4,?],i1> | |
torch.bind_symbolic_shape %780, [%773], affine_map<()[s0] -> (4, s0 * 32)> : !torch.vtensor<[4,?],i1> | |
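// %780 is the padding mask (true where position >= %arg1[b]). Next, a [1,1] ones tensor is expanded
// to the full 131072 x 131072 context and triu(diagonal = 1) produces the causal mask, which the
// slices below trim to [1, 1, seq_len, seq_len].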
%int1_4 = torch.constant.int 1 | |
%int1_5 = torch.constant.int 1 | |
%781 = torch.prim.ListConstruct %int1_4, %int1_5 : (!torch.int, !torch.int) -> !torch.list<int> | |
%int11 = torch.constant.int 11 | |
%none_6 = torch.constant.none | |
%cpu_7 = torch.constant.device "cpu" | |
%false_8 = torch.constant.bool false | |
%782 = torch.aten.ones %781, %int11, %none_6, %cpu_7, %false_8 : !torch.list<int>, !torch.int, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[1,1],i1> | |
%int131072 = torch.constant.int 131072 | |
%int131072_9 = torch.constant.int 131072 | |
%783 = torch.prim.ListConstruct %int131072, %int131072_9 : (!torch.int, !torch.int) -> !torch.list<int> | |
%false_10 = torch.constant.bool false | |
%784 = torch.aten.expand %782, %783, %false_10 : !torch.vtensor<[1,1],i1>, !torch.list<int>, !torch.bool -> !torch.vtensor<[131072,131072],i1> | |
%int1_11 = torch.constant.int 1 | |
%785 = torch.aten.triu %784, %int1_11 : !torch.vtensor<[131072,131072],i1>, !torch.int -> !torch.vtensor<[131072,131072],i1> | |
%int0_12 = torch.constant.int 0 | |
%786 = torch.aten.unsqueeze %785, %int0_12 : !torch.vtensor<[131072,131072],i1>, !torch.int -> !torch.vtensor<[1,131072,131072],i1> | |
%int1_13 = torch.constant.int 1 | |
%787 = torch.aten.unsqueeze %786, %int1_13 : !torch.vtensor<[1,131072,131072],i1>, !torch.int -> !torch.vtensor<[1,1,131072,131072],i1> | |
%int2 = torch.constant.int 2 | |
%int0_14 = torch.constant.int 0 | |
%int9223372036854775807 = torch.constant.int 9223372036854775807 | |
%int1_15 = torch.constant.int 1 | |
%788 = torch.aten.slice.Tensor %787, %int2, %int0_14, %int9223372036854775807, %int1_15 : !torch.vtensor<[1,1,131072,131072],i1>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,131072,131072],i1> | |
%int3 = torch.constant.int 3 | |
%int0_16 = torch.constant.int 0 | |
%int9223372036854775807_17 = torch.constant.int 9223372036854775807 | |
%int1_18 = torch.constant.int 1 | |
%789 = torch.aten.slice.Tensor %788, %int3, %int0_16, %int9223372036854775807_17, %int1_18 : !torch.vtensor<[1,1,131072,131072],i1>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,131072,131072],i1> | |
%int0_19 = torch.constant.int 0 | |
%int0_20 = torch.constant.int 0 | |
%int9223372036854775807_21 = torch.constant.int 9223372036854775807 | |
%int1_22 = torch.constant.int 1 | |
%790 = torch.aten.slice.Tensor %789, %int0_19, %int0_20, %int9223372036854775807_21, %int1_22 : !torch.vtensor<[1,1,131072,131072],i1>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,131072,131072],i1> | |
%int1_23 = torch.constant.int 1 | |
%int0_24 = torch.constant.int 0 | |
%int9223372036854775807_25 = torch.constant.int 9223372036854775807 | |
%int1_26 = torch.constant.int 1 | |
%791 = torch.aten.slice.Tensor %790, %int1_23, %int0_24, %int9223372036854775807_25, %int1_26 : !torch.vtensor<[1,1,131072,131072],i1>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,131072,131072],i1> | |
%int2_27 = torch.constant.int 2 | |
%int0_28 = torch.constant.int 0 | |
%int1_29 = torch.constant.int 1 | |
%792 = torch.aten.slice.Tensor %791, %int2_27, %int0_28, %777, %int1_29 : !torch.vtensor<[1,1,131072,131072],i1>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,?,131072],i1> | |
torch.bind_symbolic_shape %792, [%773], affine_map<()[s0] -> (1, 1, s0 * 32, 131072)> : !torch.vtensor<[1,1,?,131072],i1> | |
%int3_30 = torch.constant.int 3 | |
%int0_31 = torch.constant.int 0 | |
%int1_32 = torch.constant.int 1 | |
%793 = torch.aten.slice.Tensor %792, %int3_30, %int0_31, %777, %int1_32 : !torch.vtensor<[1,1,?,131072],i1>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,?,?],i1> | |
torch.bind_symbolic_shape %793, [%773], affine_map<()[s0] -> (1, 1, s0 * 32, s0 * 32)> : !torch.vtensor<[1,1,?,?],i1> | |
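// Broadcast the padding mask to [4, 1, 1, seq_len], OR it with the causal mask, and turn the combined
// boolean mask into an additive attention bias: -inf where masked, 0 elsewhere (f32).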
%int0_33 = torch.constant.int 0 | |
%int0_34 = torch.constant.int 0 | |
%int9223372036854775807_35 = torch.constant.int 9223372036854775807 | |
%int1_36 = torch.constant.int 1 | |
%794 = torch.aten.slice.Tensor %780, %int0_33, %int0_34, %int9223372036854775807_35, %int1_36 : !torch.vtensor<[4,?],i1>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[4,?],i1> | |
torch.bind_symbolic_shape %794, [%773], affine_map<()[s0] -> (4, s0 * 32)> : !torch.vtensor<[4,?],i1> | |
%int1_37 = torch.constant.int 1 | |
%795 = torch.aten.unsqueeze %794, %int1_37 : !torch.vtensor<[4,?],i1>, !torch.int -> !torch.vtensor<[4,1,?],i1> | |
torch.bind_symbolic_shape %795, [%773], affine_map<()[s0] -> (4, 1, s0 * 32)> : !torch.vtensor<[4,1,?],i1> | |
%int2_38 = torch.constant.int 2 | |
%796 = torch.aten.unsqueeze %795, %int2_38 : !torch.vtensor<[4,1,?],i1>, !torch.int -> !torch.vtensor<[4,1,1,?],i1> | |
torch.bind_symbolic_shape %796, [%773], affine_map<()[s0] -> (4, 1, 1, s0 * 32)> : !torch.vtensor<[4,1,1,?],i1> | |
%int3_39 = torch.constant.int 3 | |
%int0_40 = torch.constant.int 0 | |
%int9223372036854775807_41 = torch.constant.int 9223372036854775807 | |
%int1_42 = torch.constant.int 1 | |
%797 = torch.aten.slice.Tensor %796, %int3_39, %int0_40, %int9223372036854775807_41, %int1_42 : !torch.vtensor<[4,1,1,?],i1>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[4,1,1,?],i1> | |
torch.bind_symbolic_shape %797, [%773], affine_map<()[s0] -> (4, 1, 1, s0 * 32)> : !torch.vtensor<[4,1,1,?],i1> | |
%798 = torch.aten.logical_or %793, %797 : !torch.vtensor<[1,1,?,?],i1>, !torch.vtensor<[4,1,1,?],i1> -> !torch.vtensor<[4,1,?,?],i1> | |
torch.bind_symbolic_shape %798, [%773], affine_map<()[s0] -> (4, 1, s0 * 32, s0 * 32)> : !torch.vtensor<[4,1,?,?],i1> | |
%int0_43 = torch.constant.int 0 | |
%int6 = torch.constant.int 6 | |
%int0_44 = torch.constant.int 0 | |
%cpu_45 = torch.constant.device "cpu" | |
%none_46 = torch.constant.none | |
%799 = torch.aten.scalar_tensor %int0_43, %int6, %int0_44, %cpu_45, %none_46 : !torch.int, !torch.int, !torch.int, !torch.Device, !torch.none -> !torch.vtensor<[],f32> | |
%float-Inf = torch.constant.float 0xFFF0000000000000 | |
%int6_47 = torch.constant.int 6 | |
%int0_48 = torch.constant.int 0 | |
%cpu_49 = torch.constant.device "cpu" | |
%none_50 = torch.constant.none | |
%800 = torch.aten.scalar_tensor %float-Inf, %int6_47, %int0_48, %cpu_49, %none_50 : !torch.float, !torch.int, !torch.int, !torch.Device, !torch.none -> !torch.vtensor<[],f32> | |
%801 = torch.aten.where.self %798, %800, %799 : !torch.vtensor<[4,1,?,?],i1>, !torch.vtensor<[],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,1,?,?],f32> | |
torch.bind_symbolic_shape %801, [%773], affine_map<()[s0] -> (4, 1, s0 * 32, s0 * 32)> : !torch.vtensor<[4,1,?,?],f32> | |
%int6_51 = torch.constant.int 6 | |
%802 = torch.prims.convert_element_type %801, %int6_51 : !torch.vtensor<[4,1,?,?],f32>, !torch.int -> !torch.vtensor<[4,1,?,?],f32> | |
torch.bind_symbolic_shape %802, [%773], affine_map<()[s0] -> (4, 1, s0 * 32, s0 * 32)> : !torch.vtensor<[4,1,?,?],f32> | |
%int6_52 = torch.constant.int 6 | |
%803 = torch.prims.convert_element_type %802, %int6_52 : !torch.vtensor<[4,1,?,?],f32>, !torch.int -> !torch.vtensor<[4,1,?,?],f32> | |
torch.bind_symbolic_shape %803, [%773], affine_map<()[s0] -> (4, 1, s0 * 32, s0 * 32)> : !torch.vtensor<[4,1,?,?],f32> | |
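// Token embedding lookup: token_embd.weight (bf16) gathered by %arg0 -> [4, seq_len, 4096] bf16.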
%int15 = torch.constant.int 15 | |
%804 = torch.prims.convert_element_type %0, %int15 : !torch.vtensor<[128256,4096],bf16>, !torch.int -> !torch.vtensor<[128256,4096],bf16> | |
%int-1_53 = torch.constant.int -1 | |
%false_54 = torch.constant.bool false | |
%false_55 = torch.constant.bool false | |
%805 = torch.aten.embedding %804, %arg0, %int-1_53, %false_54, %false_55 : !torch.vtensor<[128256,4096],bf16>, !torch.vtensor<[4,?],si64>, !torch.int, !torch.bool, !torch.bool -> !torch.vtensor<[4,?,4096],bf16> | |
torch.bind_symbolic_shape %805, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],bf16> | |
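// RMSNorm (blk.0.attn_norm) in f32: mean of squares over the hidden dim, add eps 1e-05, rsqrt,
// scale the activations, cast back to bf16 and multiply by the norm weight.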
%int6_56 = torch.constant.int 6 | |
%806 = torch.prims.convert_element_type %805, %int6_56 : !torch.vtensor<[4,?,4096],bf16>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %806, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int2_57 = torch.constant.int 2 | |
%807 = torch.aten.pow.Tensor_Scalar %806, %int2_57 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %807, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int-1_58 = torch.constant.int -1 | |
%808 = torch.prim.ListConstruct %int-1_58 : (!torch.int) -> !torch.list<int> | |
%true = torch.constant.bool true | |
%none_59 = torch.constant.none | |
%809 = torch.aten.mean.dim %807, %808, %true, %none_59 : !torch.vtensor<[4,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[4,?,1],f32> | |
torch.bind_symbolic_shape %809, [%773], affine_map<()[s0] -> (4, s0 * 32, 1)> : !torch.vtensor<[4,?,1],f32> | |
%float1.000000e-05 = torch.constant.float 1.000000e-05 | |
%int1_60 = torch.constant.int 1 | |
%810 = torch.aten.add.Scalar %809, %float1.000000e-05, %int1_60 : !torch.vtensor<[4,?,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[4,?,1],f32> | |
torch.bind_symbolic_shape %810, [%773], affine_map<()[s0] -> (4, s0 * 32, 1)> : !torch.vtensor<[4,?,1],f32> | |
%811 = torch.aten.rsqrt %810 : !torch.vtensor<[4,?,1],f32> -> !torch.vtensor<[4,?,1],f32> | |
torch.bind_symbolic_shape %811, [%773], affine_map<()[s0] -> (4, s0 * 32, 1)> : !torch.vtensor<[4,?,1],f32> | |
%812 = torch.aten.mul.Tensor %806, %811 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[4,?,1],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %812, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int15_61 = torch.constant.int 15 | |
%813 = torch.prims.convert_element_type %812, %int15_61 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],bf16> | |
torch.bind_symbolic_shape %813, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],bf16> | |
%814 = torch.aten.mul.Tensor %1, %813 : !torch.vtensor<[4096],bf16>, !torch.vtensor<[4,?,4096],bf16> -> !torch.vtensor<[4,?,4096],bf16> | |
torch.bind_symbolic_shape %814, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],bf16> | |
%int15_62 = torch.constant.int 15 | |
%815 = torch.prims.convert_element_type %814, %int15_62 : !torch.vtensor<[4,?,4096],bf16>, !torch.int -> !torch.vtensor<[4,?,4096],bf16> | |
torch.bind_symbolic_shape %815, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],bf16> | |
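// Q/K/V projections for blk.0. Each input is divided by its q_input:rscale, clamped to [-240, 240]
// (the finite range of f8E4M3FNUZ), cast to f8, and fed to a batched matmul-transpose-B kernel
// against the f8 weights; the f32 result is rescaled by q_output:rscale, clamped and requantized to f8.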
%816 = torch.aten.div.Tensor %815, %2 : !torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,4096],bf16> | |
torch.bind_symbolic_shape %816, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],bf16> | |
%float-2.400000e02 = torch.constant.float -2.400000e+02 | |
%float2.400000e02 = torch.constant.float 2.400000e+02 | |
%817 = torch.aten.clamp %816, %float-2.400000e02, %float2.400000e02 : !torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float -> !torch.vtensor<[4,?,4096],bf16> | |
torch.bind_symbolic_shape %817, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],bf16> | |
%int26 = torch.constant.int 26 | |
%818 = torch.prims.convert_element_type %817, %int26 : !torch.vtensor<[4,?,4096],bf16>, !torch.int -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %818, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
%int0_63 = torch.constant.int 0 | |
%819 = torch.aten.unsqueeze %3, %int0_63 : !torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ> | |
%int4 = torch.constant.int 4 | |
%int4096 = torch.constant.int 4096 | |
%int4096_64 = torch.constant.int 4096 | |
%820 = torch.prim.ListConstruct %int4, %int4096, %int4096_64 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_65 = torch.constant.bool false | |
%821 = torch.aten.expand %819, %820, %false_65 : !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ> | |
%822 = torch_c.to_builtin_tensor %818 : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> -> tensor<4x?x4096xf8E4M3FNUZ> | |
%823 = torch_c.to_builtin_tensor %821 : !torch.vtensor<[4,4096,4096],f8E4M3FNUZ> -> tensor<4x4096x4096xf8E4M3FNUZ> | |
%824 = util.call @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ_f32(%822, %823) : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x?x4096xf32> | |
%825 = torch_c.from_builtin_tensor %824 : tensor<4x?x4096xf32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %825, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%826 = torch.aten.div.Tensor %825, %4 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %826, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%float-2.400000e02_66 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_67 = torch.constant.float 2.400000e+02 | |
%827 = torch.aten.clamp %826, %float-2.400000e02_66, %float2.400000e02_67 : !torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %827, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int26_68 = torch.constant.int 26 | |
%828 = torch.prims.convert_element_type %827, %int26_68 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %828, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
%829 = torch.aten.div.Tensor %815, %5 : !torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,4096],bf16> | |
torch.bind_symbolic_shape %829, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],bf16> | |
%float-2.400000e02_69 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_70 = torch.constant.float 2.400000e+02 | |
%830 = torch.aten.clamp %829, %float-2.400000e02_69, %float2.400000e02_70 : !torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float -> !torch.vtensor<[4,?,4096],bf16> | |
torch.bind_symbolic_shape %830, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],bf16> | |
%int26_71 = torch.constant.int 26 | |
%831 = torch.prims.convert_element_type %830, %int26_71 : !torch.vtensor<[4,?,4096],bf16>, !torch.int -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %831, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
%int0_72 = torch.constant.int 0 | |
%832 = torch.aten.unsqueeze %6, %int0_72 : !torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ> | |
%int4_73 = torch.constant.int 4 | |
%int1024 = torch.constant.int 1024 | |
%int4096_74 = torch.constant.int 4096 | |
%833 = torch.prim.ListConstruct %int4_73, %int1024, %int4096_74 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_75 = torch.constant.bool false | |
%834 = torch.aten.expand %832, %833, %false_75 : !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ> | |
%835 = torch_c.to_builtin_tensor %831 : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> -> tensor<4x?x4096xf8E4M3FNUZ> | |
%836 = torch_c.to_builtin_tensor %834 : !torch.vtensor<[4,1024,4096],f8E4M3FNUZ> -> tensor<4x1024x4096xf8E4M3FNUZ> | |
%837 = util.call @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ_f32(%835, %836) : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x?x1024xf32> | |
%838 = torch_c.from_builtin_tensor %837 : tensor<4x?x1024xf32> -> !torch.vtensor<[4,?,1024],f32> | |
torch.bind_symbolic_shape %838, [%773], affine_map<()[s0] -> (4, s0 * 32, 1024)> : !torch.vtensor<[4,?,1024],f32> | |
%839 = torch.aten.div.Tensor %838, %7 : !torch.vtensor<[4,?,1024],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,1024],f32> | |
torch.bind_symbolic_shape %839, [%773], affine_map<()[s0] -> (4, s0 * 32, 1024)> : !torch.vtensor<[4,?,1024],f32> | |
%float-2.400000e02_76 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_77 = torch.constant.float 2.400000e+02 | |
%840 = torch.aten.clamp %839, %float-2.400000e02_76, %float2.400000e02_77 : !torch.vtensor<[4,?,1024],f32>, !torch.float, !torch.float -> !torch.vtensor<[4,?,1024],f32> | |
torch.bind_symbolic_shape %840, [%773], affine_map<()[s0] -> (4, s0 * 32, 1024)> : !torch.vtensor<[4,?,1024],f32> | |
%int26_78 = torch.constant.int 26 | |
%841 = torch.prims.convert_element_type %840, %int26_78 : !torch.vtensor<[4,?,1024],f32>, !torch.int -> !torch.vtensor<[4,?,1024],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %841, [%773], affine_map<()[s0] -> (4, s0 * 32, 1024)> : !torch.vtensor<[4,?,1024],f8E4M3FNUZ> | |
%842 = torch.aten.div.Tensor %815, %8 : !torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,4096],bf16> | |
torch.bind_symbolic_shape %842, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],bf16> | |
%float-2.400000e02_79 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_80 = torch.constant.float 2.400000e+02 | |
%843 = torch.aten.clamp %842, %float-2.400000e02_79, %float2.400000e02_80 : !torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float -> !torch.vtensor<[4,?,4096],bf16> | |
torch.bind_symbolic_shape %843, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],bf16> | |
%int26_81 = torch.constant.int 26 | |
%844 = torch.prims.convert_element_type %843, %int26_81 : !torch.vtensor<[4,?,4096],bf16>, !torch.int -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %844, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
%int0_82 = torch.constant.int 0 | |
%845 = torch.aten.unsqueeze %9, %int0_82 : !torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ> | |
%int4_83 = torch.constant.int 4 | |
%int1024_84 = torch.constant.int 1024 | |
%int4096_85 = torch.constant.int 4096 | |
%846 = torch.prim.ListConstruct %int4_83, %int1024_84, %int4096_85 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_86 = torch.constant.bool false | |
%847 = torch.aten.expand %845, %846, %false_86 : !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ> | |
%848 = torch_c.to_builtin_tensor %844 : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> -> tensor<4x?x4096xf8E4M3FNUZ> | |
%849 = torch_c.to_builtin_tensor %847 : !torch.vtensor<[4,1024,4096],f8E4M3FNUZ> -> tensor<4x1024x4096xf8E4M3FNUZ> | |
%850 = util.call @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ_f32(%848, %849) : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x?x1024xf32> | |
%851 = torch_c.from_builtin_tensor %850 : tensor<4x?x1024xf32> -> !torch.vtensor<[4,?,1024],f32> | |
torch.bind_symbolic_shape %851, [%773], affine_map<()[s0] -> (4, s0 * 32, 1024)> : !torch.vtensor<[4,?,1024],f32> | |
%852 = torch.aten.div.Tensor %851, %10 : !torch.vtensor<[4,?,1024],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,1024],f32> | |
torch.bind_symbolic_shape %852, [%773], affine_map<()[s0] -> (4, s0 * 32, 1024)> : !torch.vtensor<[4,?,1024],f32> | |
%float-2.400000e02_87 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_88 = torch.constant.float 2.400000e+02 | |
%853 = torch.aten.clamp %852, %float-2.400000e02_87, %float2.400000e02_88 : !torch.vtensor<[4,?,1024],f32>, !torch.float, !torch.float -> !torch.vtensor<[4,?,1024],f32> | |
torch.bind_symbolic_shape %853, [%773], affine_map<()[s0] -> (4, s0 * 32, 1024)> : !torch.vtensor<[4,?,1024],f32> | |
%int26_89 = torch.constant.int 26 | |
%854 = torch.prims.convert_element_type %853, %int26_89 : !torch.vtensor<[4,?,1024],f32>, !torch.int -> !torch.vtensor<[4,?,1024],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %854, [%773], affine_map<()[s0] -> (4, s0 * 32, 1024)> : !torch.vtensor<[4,?,1024],f8E4M3FNUZ> | |
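// Reshape the projections into heads: Q -> [4, seq_len, 32, 128], K and V -> [4, seq_len, 8, 128]
// (32 query heads, 8 KV heads, head_dim 128).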
%int4_90 = torch.constant.int 4 | |
%int32 = torch.constant.int 32 | |
%int128 = torch.constant.int 128 | |
%855 = torch.prim.ListConstruct %int4_90, %777, %int32, %int128 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%856 = torch.aten.view %828, %855 : !torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %856, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 128)> : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
%int4_91 = torch.constant.int 4 | |
%int8 = torch.constant.int 8 | |
%int128_92 = torch.constant.int 128 | |
%857 = torch.prim.ListConstruct %int4_91, %777, %int8, %int128_92 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%858 = torch.aten.view %841, %857 : !torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %858, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 128)> : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
%int4_93 = torch.constant.int 4 | |
%int8_94 = torch.constant.int 8 | |
%int128_95 = torch.constant.int 128 | |
%859 = torch.prim.ListConstruct %int4_93, %777, %int8_94, %int128_95 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%860 = torch.aten.view %854, %859 : !torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %860, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 128)> : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
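// Rotary embedding table: inverse frequencies with base 5.0e+05 over a 128-dim head, followed by
// what appears to be Llama-3.1-style frequency scaling (scale factor 8, low/high wavelength
// thresholds 2048 / 8192), then cos/sin tables for positions [0, 131072) cast to bf16.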
%int131072_96 = torch.constant.int 131072 | |
%none_97 = torch.constant.none | |
%none_98 = torch.constant.none | |
%cpu_99 = torch.constant.device "cpu" | |
%false_100 = torch.constant.bool false | |
%861 = torch.aten.arange %int131072_96, %none_97, %none_98, %cpu_99, %false_100 : !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[131072],si64> | |
%int0_101 = torch.constant.int 0 | |
%int128_102 = torch.constant.int 128 | |
%int2_103 = torch.constant.int 2 | |
%int4_104 = torch.constant.int 4 | |
%none_105 = torch.constant.none | |
%cpu_106 = torch.constant.device "cpu" | |
%false_107 = torch.constant.bool false | |
%862 = torch.aten.arange.start_step %int0_101, %int128_102, %int2_103, %int4_104, %none_105, %cpu_106, %false_107 : !torch.int, !torch.int, !torch.int, !torch.int, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[64],si64> | |
%int6_108 = torch.constant.int 6 | |
%863 = torch.prims.convert_element_type %862, %int6_108 : !torch.vtensor<[64],si64>, !torch.int -> !torch.vtensor<[64],f32> | |
%int128_109 = torch.constant.int 128 | |
%864 = torch.aten.div.Scalar %863, %int128_109 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%float5.000000e05 = torch.constant.float 5.000000e+05 | |
%865 = torch.aten.pow.Scalar %float5.000000e05, %864 : !torch.float, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%866 = torch.aten.reciprocal %865 : !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%float1.000000e00 = torch.constant.float 1.000000e+00 | |
%867 = torch.aten.mul.Scalar %866, %float1.000000e00 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],f32> | |
%868 = torch.aten.reciprocal %867 : !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%float6.283190e00 = torch.constant.float 6.2831853071795862 | |
%869 = torch.aten.mul.Scalar %868, %float6.283190e00 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],f32> | |
%float8.192000e03 = torch.constant.float 8.192000e+03 | |
%870 = torch.aten.gt.Scalar %869, %float8.192000e03 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],i1> | |
%int8_110 = torch.constant.int 8 | |
%871 = torch.aten.div.Scalar %867, %int8_110 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%872 = torch.aten.where.self %870, %871, %867 : !torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%873 = torch.aten.reciprocal %869 : !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%int8192 = torch.constant.int 8192 | |
%874 = torch.aten.mul.Scalar %873, %int8192 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%int1_111 = torch.constant.int 1 | |
%int1_112 = torch.constant.int 1 | |
%875 = torch.aten.sub.Scalar %874, %int1_111, %int1_112 : !torch.vtensor<[64],f32>, !torch.int, !torch.int -> !torch.vtensor<[64],f32> | |
%int3_113 = torch.constant.int 3 | |
%876 = torch.aten.div.Scalar %875, %int3_113 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%int1_114 = torch.constant.int 1 | |
%int1_115 = torch.constant.int 1 | |
%877 = torch.aten.rsub.Scalar %876, %int1_114, %int1_115 : !torch.vtensor<[64],f32>, !torch.int, !torch.int -> !torch.vtensor<[64],f32> | |
%878 = torch.aten.mul.Tensor %877, %872 : !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%int8_116 = torch.constant.int 8 | |
%879 = torch.aten.div.Scalar %878, %int8_116 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%880 = torch.aten.mul.Tensor %876, %872 : !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%int1_117 = torch.constant.int 1 | |
%881 = torch.aten.add.Tensor %879, %880, %int1_117 : !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%float2.048000e03 = torch.constant.float 2.048000e+03 | |
%882 = torch.aten.lt.Scalar %869, %float2.048000e03 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],i1> | |
%883 = torch.aten.bitwise_not %882 : !torch.vtensor<[64],i1> -> !torch.vtensor<[64],i1> | |
%float8.192000e03_118 = torch.constant.float 8.192000e+03 | |
%884 = torch.aten.gt.Scalar %869, %float8.192000e03_118 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],i1> | |
%885 = torch.aten.bitwise_not %884 : !torch.vtensor<[64],i1> -> !torch.vtensor<[64],i1> | |
%886 = torch.aten.mul.Tensor %883, %885 : !torch.vtensor<[64],i1>, !torch.vtensor<[64],i1> -> !torch.vtensor<[64],i1> | |
%887 = torch.aten.where.self %886, %881, %872 : !torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%888 = torch.prim.ListConstruct %887, %887 : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.list<vtensor> | |
%int-1_119 = torch.constant.int -1 | |
%889 = torch.aten.cat %888, %int-1_119 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[128],f32> | |
%int6_120 = torch.constant.int 6 | |
%890 = torch.prims.convert_element_type %889, %int6_120 : !torch.vtensor<[128],f32>, !torch.int -> !torch.vtensor<[128],f32> | |
%int1_121 = torch.constant.int 1 | |
%891 = torch.aten.unsqueeze %861, %int1_121 : !torch.vtensor<[131072],si64>, !torch.int -> !torch.vtensor<[131072,1],si64> | |
%int6_122 = torch.constant.int 6 | |
%892 = torch.prims.convert_element_type %891, %int6_122 : !torch.vtensor<[131072,1],si64>, !torch.int -> !torch.vtensor<[131072,1],f32> | |
%int0_123 = torch.constant.int 0 | |
%893 = torch.aten.unsqueeze %890, %int0_123 : !torch.vtensor<[128],f32>, !torch.int -> !torch.vtensor<[1,128],f32> | |
%int6_124 = torch.constant.int 6 | |
%894 = torch.prims.convert_element_type %893, %int6_124 : !torch.vtensor<[1,128],f32>, !torch.int -> !torch.vtensor<[1,128],f32> | |
%895 = torch.aten.mul.Tensor %892, %894 : !torch.vtensor<[131072,1],f32>, !torch.vtensor<[1,128],f32> -> !torch.vtensor<[131072,128],f32> | |
%896 = torch.aten.cos %895 : !torch.vtensor<[131072,128],f32> -> !torch.vtensor<[131072,128],f32> | |
%int15_125 = torch.constant.int 15 | |
%897 = torch.prims.convert_element_type %896, %int15_125 : !torch.vtensor<[131072,128],f32>, !torch.int -> !torch.vtensor<[131072,128],bf16> | |
%898 = torch.aten.sin %895 : !torch.vtensor<[131072,128],f32> -> !torch.vtensor<[131072,128],f32> | |
%int15_126 = torch.constant.int 15 | |
%899 = torch.prims.convert_element_type %898, %int15_126 : !torch.vtensor<[131072,128],f32>, !torch.int -> !torch.vtensor<[131072,128],bf16> | |
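// Apply RoPE to Q: slice the cos/sin tables to the current positions, broadcast them to
// [4, seq_len, 1, 128], and combine q*cos + rotate_half(q)*sin (the neg/cat pair below builds rotate_half).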
%int0_127 = torch.constant.int 0 | |
%int0_128 = torch.constant.int 0 | |
%int1_129 = torch.constant.int 1 | |
%900 = torch.aten.slice.Tensor %897, %int0_127, %int0_128, %777, %int1_129 : !torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %900, [%773], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int1_130 = torch.constant.int 1 | |
%int0_131 = torch.constant.int 0 | |
%int9223372036854775807_132 = torch.constant.int 9223372036854775807 | |
%int1_133 = torch.constant.int 1 | |
%901 = torch.aten.slice.Tensor %900, %int1_130, %int0_131, %int9223372036854775807_132, %int1_133 : !torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %901, [%773], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int0_134 = torch.constant.int 0 | |
%int0_135 = torch.constant.int 0 | |
%int1_136 = torch.constant.int 1 | |
%902 = torch.aten.slice.Tensor %899, %int0_134, %int0_135, %777, %int1_136 : !torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %902, [%773], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int1_137 = torch.constant.int 1 | |
%int0_138 = torch.constant.int 0 | |
%int9223372036854775807_139 = torch.constant.int 9223372036854775807 | |
%int1_140 = torch.constant.int 1 | |
%903 = torch.aten.slice.Tensor %902, %int1_137, %int0_138, %int9223372036854775807_139, %int1_140 : !torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %903, [%773], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int0_141 = torch.constant.int 0 | |
%904 = torch.aten.unsqueeze %901, %int0_141 : !torch.vtensor<[?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %904, [%773], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int1_142 = torch.constant.int 1 | |
%int0_143 = torch.constant.int 0 | |
%int9223372036854775807_144 = torch.constant.int 9223372036854775807 | |
%int1_145 = torch.constant.int 1 | |
%905 = torch.aten.slice.Tensor %904, %int1_142, %int0_143, %int9223372036854775807_144, %int1_145 : !torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %905, [%773], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int2_146 = torch.constant.int 2 | |
%906 = torch.aten.unsqueeze %905, %int2_146 : !torch.vtensor<[1,?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %906, [%773], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int3_147 = torch.constant.int 3 | |
%int0_148 = torch.constant.int 0 | |
%int9223372036854775807_149 = torch.constant.int 9223372036854775807 | |
%int1_150 = torch.constant.int 1 | |
%907 = torch.aten.slice.Tensor %906, %int3_147, %int0_148, %int9223372036854775807_149, %int1_150 : !torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %907, [%773], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int4_151 = torch.constant.int 4 | |
%int1_152 = torch.constant.int 1 | |
%int1_153 = torch.constant.int 1 | |
%int1_154 = torch.constant.int 1 | |
%908 = torch.prim.ListConstruct %int4_151, %int1_152, %int1_153, %int1_154 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%909 = torch.aten.repeat %907, %908 : !torch.vtensor<[1,?,1,128],bf16>, !torch.list<int> -> !torch.vtensor<[4,?,1,128],bf16> | |
torch.bind_symbolic_shape %909, [%773], affine_map<()[s0] -> (4, s0 * 32, 1, 128)> : !torch.vtensor<[4,?,1,128],bf16> | |
%int0_155 = torch.constant.int 0 | |
%910 = torch.aten.unsqueeze %903, %int0_155 : !torch.vtensor<[?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %910, [%773], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int1_156 = torch.constant.int 1 | |
%int0_157 = torch.constant.int 0 | |
%int9223372036854775807_158 = torch.constant.int 9223372036854775807 | |
%int1_159 = torch.constant.int 1 | |
%911 = torch.aten.slice.Tensor %910, %int1_156, %int0_157, %int9223372036854775807_158, %int1_159 : !torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %911, [%773], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int2_160 = torch.constant.int 2 | |
%912 = torch.aten.unsqueeze %911, %int2_160 : !torch.vtensor<[1,?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %912, [%773], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int3_161 = torch.constant.int 3 | |
%int0_162 = torch.constant.int 0 | |
%int9223372036854775807_163 = torch.constant.int 9223372036854775807 | |
%int1_164 = torch.constant.int 1 | |
%913 = torch.aten.slice.Tensor %912, %int3_161, %int0_162, %int9223372036854775807_163, %int1_164 : !torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %913, [%773], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int4_165 = torch.constant.int 4 | |
%int1_166 = torch.constant.int 1 | |
%int1_167 = torch.constant.int 1 | |
%int1_168 = torch.constant.int 1 | |
%914 = torch.prim.ListConstruct %int4_165, %int1_166, %int1_167, %int1_168 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%915 = torch.aten.repeat %913, %914 : !torch.vtensor<[1,?,1,128],bf16>, !torch.list<int> -> !torch.vtensor<[4,?,1,128],bf16> | |
torch.bind_symbolic_shape %915, [%773], affine_map<()[s0] -> (4, s0 * 32, 1, 128)> : !torch.vtensor<[4,?,1,128],bf16> | |
%916 = torch.aten.mul.Tensor %856, %909 : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16> -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %916, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 128)> : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
%int3_169 = torch.constant.int 3 | |
%int0_170 = torch.constant.int 0 | |
%int64 = torch.constant.int 64 | |
%int1_171 = torch.constant.int 1 | |
%917 = torch.aten.slice.Tensor %856, %int3_169, %int0_170, %int64, %int1_171 : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %917, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 64)> : !torch.vtensor<[4,?,32,64],f8E4M3FNUZ> | |
%int3_172 = torch.constant.int 3 | |
%int64_173 = torch.constant.int 64 | |
%int9223372036854775807_174 = torch.constant.int 9223372036854775807 | |
%int1_175 = torch.constant.int 1 | |
%918 = torch.aten.slice.Tensor %856, %int3_172, %int64_173, %int9223372036854775807_174, %int1_175 : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %918, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 64)> : !torch.vtensor<[4,?,32,64],f8E4M3FNUZ> | |
%919 = torch.aten.neg %918 : !torch.vtensor<[4,?,32,64],f8E4M3FNUZ> -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %919, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 64)> : !torch.vtensor<[4,?,32,64],f8E4M3FNUZ> | |
%920 = torch.prim.ListConstruct %919, %917 : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>) -> !torch.list<vtensor> | |
%int-1_176 = torch.constant.int -1 | |
%921 = torch.aten.cat %920, %int-1_176 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %921, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 128)> : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
%922 = torch.aten.mul.Tensor %921, %915 : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16> -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %922, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 128)> : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
%int1_177 = torch.constant.int 1 | |
%923 = torch.aten.add.Tensor %916, %922, %int1_177 : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %923, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 128)> : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
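// The same frequency table is rebuilt for the key path, and the identical slice/neg/cat pattern
// applies RoPE to K ([4, seq_len, 8, 128]).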
%int131072_178 = torch.constant.int 131072 | |
%none_179 = torch.constant.none | |
%none_180 = torch.constant.none | |
%cpu_181 = torch.constant.device "cpu" | |
%false_182 = torch.constant.bool false | |
%924 = torch.aten.arange %int131072_178, %none_179, %none_180, %cpu_181, %false_182 : !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[131072],si64> | |
%int0_183 = torch.constant.int 0 | |
%int128_184 = torch.constant.int 128 | |
%int2_185 = torch.constant.int 2 | |
%int4_186 = torch.constant.int 4 | |
%none_187 = torch.constant.none | |
%cpu_188 = torch.constant.device "cpu" | |
%false_189 = torch.constant.bool false | |
%925 = torch.aten.arange.start_step %int0_183, %int128_184, %int2_185, %int4_186, %none_187, %cpu_188, %false_189 : !torch.int, !torch.int, !torch.int, !torch.int, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[64],si64> | |
%int6_190 = torch.constant.int 6 | |
%926 = torch.prims.convert_element_type %925, %int6_190 : !torch.vtensor<[64],si64>, !torch.int -> !torch.vtensor<[64],f32> | |
%int128_191 = torch.constant.int 128 | |
%927 = torch.aten.div.Scalar %926, %int128_191 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%float5.000000e05_192 = torch.constant.float 5.000000e+05 | |
%928 = torch.aten.pow.Scalar %float5.000000e05_192, %927 : !torch.float, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%929 = torch.aten.reciprocal %928 : !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%float1.000000e00_193 = torch.constant.float 1.000000e+00 | |
%930 = torch.aten.mul.Scalar %929, %float1.000000e00_193 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],f32> | |
%931 = torch.aten.reciprocal %930 : !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%float6.283190e00_194 = torch.constant.float 6.2831853071795862 | |
%932 = torch.aten.mul.Scalar %931, %float6.283190e00_194 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],f32> | |
%float8.192000e03_195 = torch.constant.float 8.192000e+03 | |
%933 = torch.aten.gt.Scalar %932, %float8.192000e03_195 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],i1> | |
%int8_196 = torch.constant.int 8 | |
%934 = torch.aten.div.Scalar %930, %int8_196 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%935 = torch.aten.where.self %933, %934, %930 : !torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%936 = torch.aten.reciprocal %932 : !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%int8192_197 = torch.constant.int 8192 | |
%937 = torch.aten.mul.Scalar %936, %int8192_197 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%int1_198 = torch.constant.int 1 | |
%int1_199 = torch.constant.int 1 | |
%938 = torch.aten.sub.Scalar %937, %int1_198, %int1_199 : !torch.vtensor<[64],f32>, !torch.int, !torch.int -> !torch.vtensor<[64],f32> | |
%int3_200 = torch.constant.int 3 | |
%939 = torch.aten.div.Scalar %938, %int3_200 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%int1_201 = torch.constant.int 1 | |
%int1_202 = torch.constant.int 1 | |
%940 = torch.aten.rsub.Scalar %939, %int1_201, %int1_202 : !torch.vtensor<[64],f32>, !torch.int, !torch.int -> !torch.vtensor<[64],f32> | |
%941 = torch.aten.mul.Tensor %940, %935 : !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%int8_203 = torch.constant.int 8 | |
%942 = torch.aten.div.Scalar %941, %int8_203 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%943 = torch.aten.mul.Tensor %939, %935 : !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%int1_204 = torch.constant.int 1 | |
%944 = torch.aten.add.Tensor %942, %943, %int1_204 : !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%float2.048000e03_205 = torch.constant.float 2.048000e+03 | |
%945 = torch.aten.lt.Scalar %932, %float2.048000e03_205 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],i1> | |
%946 = torch.aten.bitwise_not %945 : !torch.vtensor<[64],i1> -> !torch.vtensor<[64],i1> | |
%float8.192000e03_206 = torch.constant.float 8.192000e+03 | |
%947 = torch.aten.gt.Scalar %932, %float8.192000e03_206 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],i1> | |
%948 = torch.aten.bitwise_not %947 : !torch.vtensor<[64],i1> -> !torch.vtensor<[64],i1> | |
%949 = torch.aten.mul.Tensor %946, %948 : !torch.vtensor<[64],i1>, !torch.vtensor<[64],i1> -> !torch.vtensor<[64],i1> | |
%950 = torch.aten.where.self %949, %944, %935 : !torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%951 = torch.prim.ListConstruct %950, %950 : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.list<vtensor> | |
%int-1_207 = torch.constant.int -1 | |
%952 = torch.aten.cat %951, %int-1_207 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[128],f32> | |
%int6_208 = torch.constant.int 6 | |
%953 = torch.prims.convert_element_type %952, %int6_208 : !torch.vtensor<[128],f32>, !torch.int -> !torch.vtensor<[128],f32> | |
%int1_209 = torch.constant.int 1 | |
%954 = torch.aten.unsqueeze %924, %int1_209 : !torch.vtensor<[131072],si64>, !torch.int -> !torch.vtensor<[131072,1],si64> | |
%int6_210 = torch.constant.int 6 | |
%955 = torch.prims.convert_element_type %954, %int6_210 : !torch.vtensor<[131072,1],si64>, !torch.int -> !torch.vtensor<[131072,1],f32> | |
%int0_211 = torch.constant.int 0 | |
%956 = torch.aten.unsqueeze %953, %int0_211 : !torch.vtensor<[128],f32>, !torch.int -> !torch.vtensor<[1,128],f32> | |
%int6_212 = torch.constant.int 6 | |
%957 = torch.prims.convert_element_type %956, %int6_212 : !torch.vtensor<[1,128],f32>, !torch.int -> !torch.vtensor<[1,128],f32> | |
%958 = torch.aten.mul.Tensor %955, %957 : !torch.vtensor<[131072,1],f32>, !torch.vtensor<[1,128],f32> -> !torch.vtensor<[131072,128],f32> | |
%959 = torch.aten.cos %958 : !torch.vtensor<[131072,128],f32> -> !torch.vtensor<[131072,128],f32> | |
%int15_213 = torch.constant.int 15 | |
%960 = torch.prims.convert_element_type %959, %int15_213 : !torch.vtensor<[131072,128],f32>, !torch.int -> !torch.vtensor<[131072,128],bf16> | |
%961 = torch.aten.sin %958 : !torch.vtensor<[131072,128],f32> -> !torch.vtensor<[131072,128],f32> | |
%int15_214 = torch.constant.int 15 | |
%962 = torch.prims.convert_element_type %961, %int15_214 : !torch.vtensor<[131072,128],f32>, !torch.int -> !torch.vtensor<[131072,128],bf16> | |
%int0_215 = torch.constant.int 0 | |
%int0_216 = torch.constant.int 0 | |
%int1_217 = torch.constant.int 1 | |
%963 = torch.aten.slice.Tensor %960, %int0_215, %int0_216, %777, %int1_217 : !torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %963, [%773], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int1_218 = torch.constant.int 1 | |
%int0_219 = torch.constant.int 0 | |
%int9223372036854775807_220 = torch.constant.int 9223372036854775807 | |
%int1_221 = torch.constant.int 1 | |
%964 = torch.aten.slice.Tensor %963, %int1_218, %int0_219, %int9223372036854775807_220, %int1_221 : !torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %964, [%773], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int0_222 = torch.constant.int 0 | |
%int0_223 = torch.constant.int 0 | |
%int1_224 = torch.constant.int 1 | |
%965 = torch.aten.slice.Tensor %962, %int0_222, %int0_223, %777, %int1_224 : !torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %965, [%773], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int1_225 = torch.constant.int 1 | |
%int0_226 = torch.constant.int 0 | |
%int9223372036854775807_227 = torch.constant.int 9223372036854775807 | |
%int1_228 = torch.constant.int 1 | |
%966 = torch.aten.slice.Tensor %965, %int1_225, %int0_226, %int9223372036854775807_227, %int1_228 : !torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %966, [%773], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int0_229 = torch.constant.int 0 | |
%967 = torch.aten.unsqueeze %964, %int0_229 : !torch.vtensor<[?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %967, [%773], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int1_230 = torch.constant.int 1 | |
%int0_231 = torch.constant.int 0 | |
%int9223372036854775807_232 = torch.constant.int 9223372036854775807 | |
%int1_233 = torch.constant.int 1 | |
%968 = torch.aten.slice.Tensor %967, %int1_230, %int0_231, %int9223372036854775807_232, %int1_233 : !torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %968, [%773], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int2_234 = torch.constant.int 2 | |
%969 = torch.aten.unsqueeze %968, %int2_234 : !torch.vtensor<[1,?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %969, [%773], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int3_235 = torch.constant.int 3 | |
%int0_236 = torch.constant.int 0 | |
%int9223372036854775807_237 = torch.constant.int 9223372036854775807 | |
%int1_238 = torch.constant.int 1 | |
%970 = torch.aten.slice.Tensor %969, %int3_235, %int0_236, %int9223372036854775807_237, %int1_238 : !torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %970, [%773], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int4_239 = torch.constant.int 4 | |
%int1_240 = torch.constant.int 1 | |
%int1_241 = torch.constant.int 1 | |
%int1_242 = torch.constant.int 1 | |
%971 = torch.prim.ListConstruct %int4_239, %int1_240, %int1_241, %int1_242 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%972 = torch.aten.repeat %970, %971 : !torch.vtensor<[1,?,1,128],bf16>, !torch.list<int> -> !torch.vtensor<[4,?,1,128],bf16> | |
torch.bind_symbolic_shape %972, [%773], affine_map<()[s0] -> (4, s0 * 32, 1, 128)> : !torch.vtensor<[4,?,1,128],bf16> | |
%int0_243 = torch.constant.int 0 | |
%973 = torch.aten.unsqueeze %966, %int0_243 : !torch.vtensor<[?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %973, [%773], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int1_244 = torch.constant.int 1 | |
%int0_245 = torch.constant.int 0 | |
%int9223372036854775807_246 = torch.constant.int 9223372036854775807 | |
%int1_247 = torch.constant.int 1 | |
%974 = torch.aten.slice.Tensor %973, %int1_244, %int0_245, %int9223372036854775807_246, %int1_247 : !torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %974, [%773], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int2_248 = torch.constant.int 2 | |
%975 = torch.aten.unsqueeze %974, %int2_248 : !torch.vtensor<[1,?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %975, [%773], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int3_249 = torch.constant.int 3 | |
%int0_250 = torch.constant.int 0 | |
%int9223372036854775807_251 = torch.constant.int 9223372036854775807 | |
%int1_252 = torch.constant.int 1 | |
%976 = torch.aten.slice.Tensor %975, %int3_249, %int0_250, %int9223372036854775807_251, %int1_252 : !torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %976, [%773], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int4_253 = torch.constant.int 4 | |
%int1_254 = torch.constant.int 1 | |
%int1_255 = torch.constant.int 1 | |
%int1_256 = torch.constant.int 1 | |
%977 = torch.prim.ListConstruct %int4_253, %int1_254, %int1_255, %int1_256 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%978 = torch.aten.repeat %976, %977 : !torch.vtensor<[1,?,1,128],bf16>, !torch.list<int> -> !torch.vtensor<[4,?,1,128],bf16> | |
torch.bind_symbolic_shape %978, [%773], affine_map<()[s0] -> (4, s0 * 32, 1, 128)> : !torch.vtensor<[4,?,1,128],bf16> | |
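// The broadcast position tables built above (%972 and %978, presumably the cos and sin tables gathered
// per position) are applied below to the 8-head K projection %858 in the rotate-half RoPE form:
// k * cos + rotate_half(k) * sin.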
%979 = torch.aten.mul.Tensor %858, %972 : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16> -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %979, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 128)> : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
%int3_257 = torch.constant.int 3 | |
%int0_258 = torch.constant.int 0 | |
%int64_259 = torch.constant.int 64 | |
%int1_260 = torch.constant.int 1 | |
%980 = torch.aten.slice.Tensor %858, %int3_257, %int0_258, %int64_259, %int1_260 : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %980, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 64)> : !torch.vtensor<[4,?,8,64],f8E4M3FNUZ> | |
%int3_261 = torch.constant.int 3 | |
%int64_262 = torch.constant.int 64 | |
%int9223372036854775807_263 = torch.constant.int 9223372036854775807 | |
%int1_264 = torch.constant.int 1 | |
%981 = torch.aten.slice.Tensor %858, %int3_261, %int64_262, %int9223372036854775807_263, %int1_264 : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %981, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 64)> : !torch.vtensor<[4,?,8,64],f8E4M3FNUZ> | |
%982 = torch.aten.neg %981 : !torch.vtensor<[4,?,8,64],f8E4M3FNUZ> -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %982, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 64)> : !torch.vtensor<[4,?,8,64],f8E4M3FNUZ> | |
%983 = torch.prim.ListConstruct %982, %980 : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>) -> !torch.list<vtensor> | |
%int-1_265 = torch.constant.int -1 | |
%984 = torch.aten.cat %983, %int-1_265 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %984, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 128)> : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
%985 = torch.aten.mul.Tensor %984, %978 : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16> -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %985, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 128)> : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
%int1_266 = torch.constant.int 1 | |
%986 = torch.aten.add.Tensor %979, %985, %int1_266 : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %986, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 128)> : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
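// Store the rotated keys into the paged KV cache: the flat cache buffer %771 ([?, 2097152] f8) is viewed
// as [pages, 32, 2, 32, 8, 128] (what looks like [page, block, K/V, tokens-per-page, kv-heads, head-dim])
// and flattened to per-slot pages; scatter indices come from the page table %arg2 as page_id * 64 + 0,
// which appears to select this block's K slots.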
%int32_267 = torch.constant.int 32 | |
%int2_268 = torch.constant.int 2 | |
%int32_269 = torch.constant.int 32 | |
%int8_270 = torch.constant.int 8 | |
%int128_271 = torch.constant.int 128 | |
%987 = torch.prim.ListConstruct %776, %int32_267, %int2_268, %int32_269, %int8_270, %int128_271 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%988 = torch.aten.view %771, %987 : !torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %988, [%774], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
%int32_272 = torch.constant.int 32 | |
%989 = torch.aten.mul.int %776, %int32_272 : !torch.int, !torch.int -> !torch.int | |
%int2_273 = torch.constant.int 2 | |
%990 = torch.aten.mul.int %989, %int2_273 : !torch.int, !torch.int -> !torch.int | |
%int32_274 = torch.constant.int 32 | |
%int8_275 = torch.constant.int 8 | |
%int128_276 = torch.constant.int 128 | |
%991 = torch.prim.ListConstruct %990, %int32_274, %int8_275, %int128_276 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%992 = torch.aten.view %988, %991 : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %992, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%int64_277 = torch.constant.int 64 | |
%993 = torch.aten.mul.Scalar %arg2, %int64_277 : !torch.vtensor<[4,?],si64>, !torch.int -> !torch.vtensor<[4,?],si64> | |
torch.bind_symbolic_shape %993, [%773], affine_map<()[s0] -> (4, s0)> : !torch.vtensor<[4,?],si64> | |
%int0_278 = torch.constant.int 0 | |
%int1_279 = torch.constant.int 1 | |
%994 = torch.aten.add.Scalar %993, %int0_278, %int1_279 : !torch.vtensor<[4,?],si64>, !torch.int, !torch.int -> !torch.vtensor<[4,?],si64> | |
torch.bind_symbolic_shape %994, [%773], affine_map<()[s0] -> (4, s0)> : !torch.vtensor<[4,?],si64> | |
%int4_280 = torch.constant.int 4 | |
%int32_281 = torch.constant.int 32 | |
%int8_282 = torch.constant.int 8 | |
%int128_283 = torch.constant.int 128 | |
%995 = torch.prim.ListConstruct %int4_280, %775, %int32_281, %int8_282, %int128_283 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%996 = torch.aten.view %986, %995 : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %996, [%773], affine_map<()[s0] -> (4, s0, 32, 8, 128)> : !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ> | |
%int4_284 = torch.constant.int 4 | |
%997 = torch.aten.mul.int %int4_284, %775 : !torch.int, !torch.int -> !torch.int | |
%int32_285 = torch.constant.int 32 | |
%int8_286 = torch.constant.int 8 | |
%int128_287 = torch.constant.int 128 | |
%998 = torch.prim.ListConstruct %997, %int32_285, %int8_286, %int128_287 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%999 = torch.aten.view %996, %998 : !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %999, [%773], affine_map<()[s0] -> (s0 * 4, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%1000 = torch.prim.ListConstruct %997 : (!torch.int) -> !torch.list<int> | |
%1001 = torch.aten.view %994, %1000 : !torch.vtensor<[4,?],si64>, !torch.list<int> -> !torch.vtensor<[?],si64> | |
torch.bind_symbolic_shape %1001, [%773], affine_map<()[s0] -> (s0 * 4)> : !torch.vtensor<[?],si64> | |
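// The scatter itself is done on a byte view: both the cache pages and the new keys are bit-cast from
// f8E4M3FNUZ (dtype 26) to si8 (dtype 1), written with index_put, then bit-cast back and reshaped into
// the flat [?, 2097152] cache layout.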
%int26_288 = torch.constant.int 26 | |
%1002 = torch.prims.convert_element_type %999, %int26_288 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1002, [%773], affine_map<()[s0] -> (s0 * 4, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%int1_289 = torch.constant.int 1 | |
%1003 = torch.aten.view.dtype %992, %int1_289 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %1003, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%1004 = torch.aten.detach %1003 : !torch.vtensor<[?,32,8,128],si8> -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %1004, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%1005 = torch.aten.detach %1004 : !torch.vtensor<[?,32,8,128],si8> -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %1005, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%int1_290 = torch.constant.int 1 | |
%1006 = torch.aten.view.dtype %1002, %int1_290 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %1006, [%773], affine_map<()[s0] -> (s0 * 4, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%1007 = torch.aten.detach %1006 : !torch.vtensor<[?,32,8,128],si8> -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %1007, [%773], affine_map<()[s0] -> (s0 * 4, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%1008 = torch.aten.detach %1007 : !torch.vtensor<[?,32,8,128],si8> -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %1008, [%773], affine_map<()[s0] -> (s0 * 4, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%1009 = torch.prim.ListConstruct %1001 : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>> | |
%false_291 = torch.constant.bool false | |
%1010 = torch.aten.index_put %1005, %1009, %1008, %false_291 : !torch.vtensor<[?,32,8,128],si8>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],si8>, !torch.bool -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %1010, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%int26_292 = torch.constant.int 26 | |
%1011 = torch.aten.view.dtype %1010, %int26_292 : !torch.vtensor<[?,32,8,128],si8>, !torch.int -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1011, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%1012 = torch.aten.detach %1011 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1012, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%1013 = torch.aten.detach %1012 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1013, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%int32_293 = torch.constant.int 32 | |
%int2_294 = torch.constant.int 2 | |
%int32_295 = torch.constant.int 32 | |
%int8_296 = torch.constant.int 8 | |
%int128_297 = torch.constant.int 128 | |
%1014 = torch.prim.ListConstruct %776, %int32_293, %int2_294, %int32_295, %int8_296, %int128_297 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1015 = torch.aten.view %1013, %1014 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1015, [%774], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
%int2097152 = torch.constant.int 2097152 | |
%1016 = torch.prim.ListConstruct %776, %int2097152 : (!torch.int, !torch.int) -> !torch.list<int> | |
%1017 = torch.aten.view %1015, %1016 : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,2097152],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1017, [%774], affine_map<()[s0] -> (s0, 2097152)> : !torch.vtensor<[?,2097152],f8E4M3FNUZ> | |
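// The value projection %860 is written into the cache the same way, using slot offset +1 within each
// page (%1022 = %994 + 1).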
%int4_298 = torch.constant.int 4 | |
%int32_299 = torch.constant.int 32 | |
%int8_300 = torch.constant.int 8 | |
%int128_301 = torch.constant.int 128 | |
%1018 = torch.prim.ListConstruct %int4_298, %775, %int32_299, %int8_300, %int128_301 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1019 = torch.aten.view %860, %1018 : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1019, [%773], affine_map<()[s0] -> (4, s0, 32, 8, 128)> : !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ> | |
%int32_302 = torch.constant.int 32 | |
%int8_303 = torch.constant.int 8 | |
%int128_304 = torch.constant.int 128 | |
%1020 = torch.prim.ListConstruct %997, %int32_302, %int8_303, %int128_304 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1021 = torch.aten.view %1019, %1020 : !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1021, [%773], affine_map<()[s0] -> (s0 * 4, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%int1_305 = torch.constant.int 1 | |
%int1_306 = torch.constant.int 1 | |
%1022 = torch.aten.add.Scalar %994, %int1_305, %int1_306 : !torch.vtensor<[4,?],si64>, !torch.int, !torch.int -> !torch.vtensor<[4,?],si64> | |
torch.bind_symbolic_shape %1022, [%773], affine_map<()[s0] -> (4, s0)> : !torch.vtensor<[4,?],si64> | |
%1023 = torch.prim.ListConstruct %997 : (!torch.int) -> !torch.list<int> | |
%1024 = torch.aten.view %1022, %1023 : !torch.vtensor<[4,?],si64>, !torch.list<int> -> !torch.vtensor<[?],si64> | |
torch.bind_symbolic_shape %1024, [%773], affine_map<()[s0] -> (s0 * 4)> : !torch.vtensor<[?],si64> | |
%int26_307 = torch.constant.int 26 | |
%1025 = torch.prims.convert_element_type %1021, %int26_307 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1025, [%773], affine_map<()[s0] -> (s0 * 4, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%int1_308 = torch.constant.int 1 | |
%1026 = torch.aten.view.dtype %1025, %int1_308 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %1026, [%773], affine_map<()[s0] -> (s0 * 4, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%1027 = torch.aten.detach %1026 : !torch.vtensor<[?,32,8,128],si8> -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %1027, [%773], affine_map<()[s0] -> (s0 * 4, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%1028 = torch.aten.detach %1027 : !torch.vtensor<[?,32,8,128],si8> -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %1028, [%773], affine_map<()[s0] -> (s0 * 4, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%int32_309 = torch.constant.int 32 | |
%int2_310 = torch.constant.int 2 | |
%int32_311 = torch.constant.int 32 | |
%int8_312 = torch.constant.int 8 | |
%int128_313 = torch.constant.int 128 | |
%1029 = torch.prim.ListConstruct %776, %int32_309, %int2_310, %int32_311, %int8_312, %int128_313 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1030 = torch.aten.view %1017, %1029 : !torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1030, [%774], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
%int32_314 = torch.constant.int 32 | |
%int8_315 = torch.constant.int 8 | |
%int128_316 = torch.constant.int 128 | |
%1031 = torch.prim.ListConstruct %990, %int32_314, %int8_315, %int128_316 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1032 = torch.aten.view %1030, %1031 : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1032, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%int1_317 = torch.constant.int 1 | |
%1033 = torch.aten.view.dtype %1032, %int1_317 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %1033, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%1034 = torch.aten.detach %1033 : !torch.vtensor<[?,32,8,128],si8> -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %1034, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%1035 = torch.aten.detach %1034 : !torch.vtensor<[?,32,8,128],si8> -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %1035, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%1036 = torch.prim.ListConstruct %1024 : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>> | |
%false_318 = torch.constant.bool false | |
%1037 = torch.aten.index_put %1035, %1036, %1028, %false_318 : !torch.vtensor<[?,32,8,128],si8>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],si8>, !torch.bool -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %1037, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%int26_319 = torch.constant.int 26 | |
%1038 = torch.aten.view.dtype %1037, %int26_319 : !torch.vtensor<[?,32,8,128],si8>, !torch.int -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1038, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%1039 = torch.aten.detach %1038 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1039, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%1040 = torch.aten.detach %1039 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1040, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%int32_320 = torch.constant.int 32 | |
%int2_321 = torch.constant.int 2 | |
%int32_322 = torch.constant.int 32 | |
%int8_323 = torch.constant.int 8 | |
%int128_324 = torch.constant.int 128 | |
%1041 = torch.prim.ListConstruct %776, %int32_320, %int2_321, %int32_322, %int8_323, %int128_324 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1042 = torch.aten.view %1040, %1041 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1042, [%774], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
%int2097152_325 = torch.constant.int 2097152 | |
%1043 = torch.prim.ListConstruct %776, %int2097152_325 : (!torch.int, !torch.int) -> !torch.list<int> | |
%1044 = torch.aten.view %1042, %1043 : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,2097152],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1044, [%774], affine_map<()[s0] -> (s0, 2097152)> : !torch.vtensor<[?,2097152],f8E4M3FNUZ> | |
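// Grouped-query attention head expansion: each of the 8 K/V heads is repeated 4 times
// (unsqueeze + expand + clone + view) to match the 32 query heads, giving [4, seq, 32, 128] K and V.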
%int-2 = torch.constant.int -2 | |
%1045 = torch.aten.unsqueeze %986, %int-2 : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1045, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 1, 128)> : !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ> | |
%int4_326 = torch.constant.int 4 | |
%int8_327 = torch.constant.int 8 | |
%int4_328 = torch.constant.int 4 | |
%int128_329 = torch.constant.int 128 | |
%1046 = torch.prim.ListConstruct %int4_326, %777, %int8_327, %int4_328, %int128_329 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_330 = torch.constant.bool false | |
%1047 = torch.aten.expand %1045, %1046, %false_330 : !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1047, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 4, 128)> : !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ> | |
%int0_331 = torch.constant.int 0 | |
%1048 = torch.aten.clone %1047, %int0_331 : !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1048, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 4, 128)> : !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ> | |
%int4_332 = torch.constant.int 4 | |
%int32_333 = torch.constant.int 32 | |
%int128_334 = torch.constant.int 128 | |
%1049 = torch.prim.ListConstruct %int4_332, %777, %int32_333, %int128_334 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1050 = torch.aten._unsafe_view %1048, %1049 : !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1050, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 128)> : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
%int-2_335 = torch.constant.int -2 | |
%1051 = torch.aten.unsqueeze %860, %int-2_335 : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1051, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 1, 128)> : !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ> | |
%int4_336 = torch.constant.int 4 | |
%int8_337 = torch.constant.int 8 | |
%int4_338 = torch.constant.int 4 | |
%int128_339 = torch.constant.int 128 | |
%1052 = torch.prim.ListConstruct %int4_336, %777, %int8_337, %int4_338, %int128_339 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_340 = torch.constant.bool false | |
%1053 = torch.aten.expand %1051, %1052, %false_340 : !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1053, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 4, 128)> : !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ> | |
%int0_341 = torch.constant.int 0 | |
%1054 = torch.aten.clone %1053, %int0_341 : !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1054, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 4, 128)> : !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ> | |
%int4_342 = torch.constant.int 4 | |
%int32_343 = torch.constant.int 32 | |
%int128_344 = torch.constant.int 128 | |
%1055 = torch.prim.ListConstruct %int4_342, %777, %int32_343, %int128_344 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1056 = torch.aten._unsafe_view %1054, %1055 : !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1056, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 128)> : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
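// Q (%923), K (%1050) and V (%1056) are transposed to [batch, heads, seq, head_dim] and converted to
// f8E4M3FNUZ for the fused attention kernel.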
%int1_345 = torch.constant.int 1 | |
%int2_346 = torch.constant.int 2 | |
%1057 = torch.aten.transpose.int %923, %int1_345, %int2_346 : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1057, [%773], affine_map<()[s0] -> (4, 32, s0 * 32, 128)> : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
%int1_347 = torch.constant.int 1 | |
%int2_348 = torch.constant.int 2 | |
%1058 = torch.aten.transpose.int %1050, %int1_347, %int2_348 : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1058, [%773], affine_map<()[s0] -> (4, 32, s0 * 32, 128)> : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
%int1_349 = torch.constant.int 1 | |
%int2_350 = torch.constant.int 2 | |
%1059 = torch.aten.transpose.int %1056, %int1_349, %int2_350 : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1059, [%773], affine_map<()[s0] -> (4, 32, s0 * 32, 128)> : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
%int26_351 = torch.constant.int 26 | |
%1060 = torch.prims.convert_element_type %1057, %int26_351 : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1060, [%773], affine_map<()[s0] -> (4, 32, s0 * 32, 128)> : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
%int26_352 = torch.constant.int 26 | |
%1061 = torch.prims.convert_element_type %1058, %int26_352 : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1061, [%773], affine_map<()[s0] -> (4, 32, s0 * 32, 128)> : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
%int26_353 = torch.constant.int 26 | |
%1062 = torch.prims.convert_element_type %1059, %int26_353 : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1062, [%773], affine_map<()[s0] -> (4, 32, s0 * 32, 128)> : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
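// The additive attention mask %803 ([4, 1, seq, seq] f32) is cast to f8 and reduced to a single
// [seq, seq] slice taken from batch 0, matching the 2-D mask operand of the kernel below.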
%int26_354 = torch.constant.int 26 | |
%1063 = torch.prims.convert_element_type %803, %int26_354 : !torch.vtensor<[4,1,?,?],f32>, !torch.int -> !torch.vtensor<[4,1,?,?],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1063, [%773], affine_map<()[s0] -> (4, 1, s0 * 32, s0 * 32)> : !torch.vtensor<[4,1,?,?],f8E4M3FNUZ> | |
%int0_355 = torch.constant.int 0 | |
%int0_356 = torch.constant.int 0 | |
%1064 = torch.aten.select.int %1063, %int0_355, %int0_356 : !torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[1,?,?],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1064, [%773], affine_map<()[s0] -> (1, s0 * 32, s0 * 32)> : !torch.vtensor<[1,?,?],f8E4M3FNUZ> | |
%int0_357 = torch.constant.int 0 | |
%int0_358 = torch.constant.int 0 | |
%1065 = torch.aten.select.int %1064, %int0_357, %int0_358 : !torch.vtensor<[1,?,?],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[?,?],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1065, [%773], affine_map<()[s0] -> (s0 * 32, s0 * 32)> : !torch.vtensor<[?,?],f8E4M3FNUZ> | |
%int0_359 = torch.constant.int 0 | |
%int0_360 = torch.constant.int 0 | |
%int9223372036854775807_361 = torch.constant.int 9223372036854775807 | |
%int1_362 = torch.constant.int 1 | |
%1066 = torch.aten.slice.Tensor %1065, %int0_359, %int0_360, %int9223372036854775807_361, %int1_362 : !torch.vtensor<[?,?],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,?],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1066, [%773], affine_map<()[s0] -> (s0 * 32, s0 * 32)> : !torch.vtensor<[?,?],f8E4M3FNUZ> | |
%int1_363 = torch.constant.int 1 | |
%int0_364 = torch.constant.int 0 | |
%int9223372036854775807_365 = torch.constant.int 9223372036854775807 | |
%int1_366 = torch.constant.int 1 | |
%1067 = torch.aten.slice.Tensor %1066, %int1_363, %int0_364, %int9223372036854775807_365, %int1_366 : !torch.vtensor<[?,?],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,?],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1067, [%773], affine_map<()[s0] -> (s0 * 32, s0 * 32)> : !torch.vtensor<[?,?],f8E4M3FNUZ> | |
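// The scalar scale constant %11 is cloned/detached and passed, together with q/k/v and the mask, to the
// sharktank masked flash-attention microkernel, which returns the attention output in f32.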
%none_367 = torch.constant.none | |
%1068 = torch.aten.clone %11, %none_367 : !torch.vtensor<[],f32>, !torch.none -> !torch.vtensor<[],f32> | |
%1069 = torch.aten.detach %1068 : !torch.vtensor<[],f32> -> !torch.vtensor<[],f32> | |
%1070 = torch.aten.detach %1069 : !torch.vtensor<[],f32> -> !torch.vtensor<[],f32> | |
%1071 = torch.aten.detach %1070 : !torch.vtensor<[],f32> -> !torch.vtensor<[],f32> | |
%1072 = torch_c.to_builtin_tensor %1060 : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> -> tensor<4x32x?x128xf8E4M3FNUZ> | |
%1073 = torch_c.to_builtin_tensor %1061 : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> -> tensor<4x32x?x128xf8E4M3FNUZ> | |
%1074 = torch_c.to_builtin_tensor %1062 : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> -> tensor<4x32x?x128xf8E4M3FNUZ> | |
%1075 = torch_c.to_builtin_tensor %1067 : !torch.vtensor<[?,?],f8E4M3FNUZ> -> tensor<?x?xf8E4M3FNUZ> | |
%1076 = torch_c.to_builtin_tensor %1071 : !torch.vtensor<[],f32> -> tensor<f32> | |
%1077 = util.call @sharktank_masked_flash_attention_4_32_128_128_f8E4M3FNUZ_f8E4M3FNUZ_f32_f32(%1072, %1073, %1074, %1076, %1075) : (tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<f32>, tensor<?x?xf8E4M3FNUZ>) -> tensor<4x32x?x128xf32> | |
%1078 = torch_c.from_builtin_tensor %1077 : tensor<4x32x?x128xf32> -> !torch.vtensor<[4,32,?,128],f32> | |
torch.bind_symbolic_shape %1078, [%773], affine_map<()[s0] -> (4, 32, s0 * 32, 128)> : !torch.vtensor<[4,32,?,128],f32> | |
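// Attention output: transpose back to [4, seq, 32, 128], flatten to [4, seq, 4096], then re-quantize to
// f8 for the output projection (divide by the input scale %12 and clamp to the f8E4M3FNUZ finite
// range of +/-240).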
%int1_368 = torch.constant.int 1 | |
%int2_369 = torch.constant.int 2 | |
%1079 = torch.aten.transpose.int %1078, %int1_368, %int2_369 : !torch.vtensor<[4,32,?,128],f32>, !torch.int, !torch.int -> !torch.vtensor<[4,?,32,128],f32> | |
torch.bind_symbolic_shape %1079, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 128)> : !torch.vtensor<[4,?,32,128],f32> | |
%int0_370 = torch.constant.int 0 | |
%1080 = torch.aten.clone %1079, %int0_370 : !torch.vtensor<[4,?,32,128],f32>, !torch.int -> !torch.vtensor<[4,?,32,128],f32> | |
torch.bind_symbolic_shape %1080, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 128)> : !torch.vtensor<[4,?,32,128],f32> | |
%int4_371 = torch.constant.int 4 | |
%int4096_372 = torch.constant.int 4096 | |
%1081 = torch.prim.ListConstruct %int4_371, %777, %int4096_372 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1082 = torch.aten._unsafe_view %1080, %1081 : !torch.vtensor<[4,?,32,128],f32>, !torch.list<int> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1082, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%1083 = torch.aten.div.Tensor %1082, %12 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1083, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%float-2.400000e02_373 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_374 = torch.constant.float 2.400000e+02 | |
%1084 = torch.aten.clamp %1083, %float-2.400000e02_373, %float2.400000e02_374 : !torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1084, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int26_375 = torch.constant.int 26 | |
%1085 = torch.prims.convert_element_type %1084, %int26_375 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1085, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
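// Output projection: the 4096x4096 weight %13 (presumably attn_output) is broadcast over the batch and
// applied with the batch_matmul_transpose_b kernel; the f32 result is rescaled by the combined dequant
// scale (%14 * %12) and added to the residual stream %805.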
%int0_376 = torch.constant.int 0 | |
%1086 = torch.aten.unsqueeze %13, %int0_376 : !torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ> | |
%int4_377 = torch.constant.int 4 | |
%int4096_378 = torch.constant.int 4096 | |
%int4096_379 = torch.constant.int 4096 | |
%1087 = torch.prim.ListConstruct %int4_377, %int4096_378, %int4096_379 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_380 = torch.constant.bool false | |
%1088 = torch.aten.expand %1086, %1087, %false_380 : !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ> | |
%1089 = torch_c.to_builtin_tensor %1085 : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> -> tensor<4x?x4096xf8E4M3FNUZ> | |
%1090 = torch_c.to_builtin_tensor %1088 : !torch.vtensor<[4,4096,4096],f8E4M3FNUZ> -> tensor<4x4096x4096xf8E4M3FNUZ> | |
%1091 = util.call @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ_f32(%1089, %1090) : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x?x4096xf32> | |
%1092 = torch_c.from_builtin_tensor %1091 : tensor<4x?x4096xf32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1092, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%1093 = torch.prim.ListConstruct : () -> !torch.list<int> | |
%1094 = torch.aten.permute %14, %1093 : !torch.vtensor<[],f32>, !torch.list<int> -> !torch.vtensor<[],f32> | |
%1095 = torch.aten.mul.Tensor %12, %1094 : !torch.vtensor<[],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[],f32> | |
%int6_381 = torch.constant.int 6 | |
%1096 = torch.prims.convert_element_type %1092, %int6_381 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1096, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%1097 = torch.aten.mul.Tensor %1096, %1095 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1097, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int1_382 = torch.constant.int 1 | |
%1098 = torch.aten.add.Tensor %805, %1097, %int1_382 : !torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1098, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
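// RMSNorm over the updated residual: mean of squares along the hidden dim, add eps 1e-5, rsqrt, then
// scale by %15 (presumably the ffn_norm weight).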
%int6_383 = torch.constant.int 6 | |
%1099 = torch.prims.convert_element_type %1098, %int6_383 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1099, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int2_384 = torch.constant.int 2 | |
%1100 = torch.aten.pow.Tensor_Scalar %1099, %int2_384 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1100, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int-1_385 = torch.constant.int -1 | |
%1101 = torch.prim.ListConstruct %int-1_385 : (!torch.int) -> !torch.list<int> | |
%true_386 = torch.constant.bool true | |
%none_387 = torch.constant.none | |
%1102 = torch.aten.mean.dim %1100, %1101, %true_386, %none_387 : !torch.vtensor<[4,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[4,?,1],f32> | |
torch.bind_symbolic_shape %1102, [%773], affine_map<()[s0] -> (4, s0 * 32, 1)> : !torch.vtensor<[4,?,1],f32> | |
%float1.000000e-05_388 = torch.constant.float 1.000000e-05 | |
%int1_389 = torch.constant.int 1 | |
%1103 = torch.aten.add.Scalar %1102, %float1.000000e-05_388, %int1_389 : !torch.vtensor<[4,?,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[4,?,1],f32> | |
torch.bind_symbolic_shape %1103, [%773], affine_map<()[s0] -> (4, s0 * 32, 1)> : !torch.vtensor<[4,?,1],f32> | |
%1104 = torch.aten.rsqrt %1103 : !torch.vtensor<[4,?,1],f32> -> !torch.vtensor<[4,?,1],f32> | |
torch.bind_symbolic_shape %1104, [%773], affine_map<()[s0] -> (4, s0 * 32, 1)> : !torch.vtensor<[4,?,1],f32> | |
%1105 = torch.aten.mul.Tensor %1099, %1104 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[4,?,1],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1105, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int6_390 = torch.constant.int 6 | |
%1106 = torch.prims.convert_element_type %1105, %int6_390 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1106, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%1107 = torch.aten.mul.Tensor %15, %1106 : !torch.vtensor<[4096],bf16>, !torch.vtensor<[4,?,4096],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1107, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int6_391 = torch.constant.int 6 | |
%1108 = torch.prims.convert_element_type %1107, %int6_391 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1108, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
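// Gate branch of the FFN: quantize to f8 (divide by %16, clamp to +/-240), batch matmul against the
// 14336x4096 weight %17, rescale by %18 * %16, then apply SiLU.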
%1109 = torch.aten.div.Tensor %1108, %16 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1109, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%float-2.400000e02_392 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_393 = torch.constant.float 2.400000e+02 | |
%1110 = torch.aten.clamp %1109, %float-2.400000e02_392, %float2.400000e02_393 : !torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1110, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int26_394 = torch.constant.int 26 | |
%1111 = torch.prims.convert_element_type %1110, %int26_394 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1111, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
%int0_395 = torch.constant.int 0 | |
%1112 = torch.aten.unsqueeze %17, %int0_395 : !torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ> | |
%int4_396 = torch.constant.int 4 | |
%int14336 = torch.constant.int 14336 | |
%int4096_397 = torch.constant.int 4096 | |
%1113 = torch.prim.ListConstruct %int4_396, %int14336, %int4096_397 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_398 = torch.constant.bool false | |
%1114 = torch.aten.expand %1112, %1113, %false_398 : !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ> | |
%1115 = torch_c.to_builtin_tensor %1111 : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> -> tensor<4x?x4096xf8E4M3FNUZ> | |
%1116 = torch_c.to_builtin_tensor %1114 : !torch.vtensor<[4,14336,4096],f8E4M3FNUZ> -> tensor<4x14336x4096xf8E4M3FNUZ> | |
%1117 = util.call @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ_f32(%1115, %1116) : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x?x14336xf32> | |
%1118 = torch_c.from_builtin_tensor %1117 : tensor<4x?x14336xf32> -> !torch.vtensor<[4,?,14336],f32> | |
torch.bind_symbolic_shape %1118, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f32> | |
%1119 = torch.prim.ListConstruct : () -> !torch.list<int> | |
%1120 = torch.aten.permute %18, %1119 : !torch.vtensor<[],f32>, !torch.list<int> -> !torch.vtensor<[],f32> | |
%1121 = torch.aten.mul.Tensor %16, %1120 : !torch.vtensor<[],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[],f32> | |
%int6_399 = torch.constant.int 6 | |
%1122 = torch.prims.convert_element_type %1118, %int6_399 : !torch.vtensor<[4,?,14336],f32>, !torch.int -> !torch.vtensor<[4,?,14336],f32> | |
torch.bind_symbolic_shape %1122, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f32> | |
%1123 = torch.aten.mul.Tensor %1122, %1121 : !torch.vtensor<[4,?,14336],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,14336],f32> | |
torch.bind_symbolic_shape %1123, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f32> | |
%1124 = torch.aten.silu %1123 : !torch.vtensor<[4,?,14336],f32> -> !torch.vtensor<[4,?,14336],f32> | |
torch.bind_symbolic_shape %1124, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f32> | |
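// Up branch of the FFN: the same quantize / matmul / rescale sequence with the 14336x4096 weight %20
// and scales %19 and %21.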
%1125 = torch.aten.div.Tensor %1108, %19 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1125, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%float-2.400000e02_400 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_401 = torch.constant.float 2.400000e+02 | |
%1126 = torch.aten.clamp %1125, %float-2.400000e02_400, %float2.400000e02_401 : !torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1126, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int26_402 = torch.constant.int 26 | |
%1127 = torch.prims.convert_element_type %1126, %int26_402 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1127, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
%int0_403 = torch.constant.int 0 | |
%1128 = torch.aten.unsqueeze %20, %int0_403 : !torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ> | |
%int4_404 = torch.constant.int 4 | |
%int14336_405 = torch.constant.int 14336 | |
%int4096_406 = torch.constant.int 4096 | |
%1129 = torch.prim.ListConstruct %int4_404, %int14336_405, %int4096_406 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_407 = torch.constant.bool false | |
%1130 = torch.aten.expand %1128, %1129, %false_407 : !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ> | |
%1131 = torch_c.to_builtin_tensor %1127 : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> -> tensor<4x?x4096xf8E4M3FNUZ> | |
%1132 = torch_c.to_builtin_tensor %1130 : !torch.vtensor<[4,14336,4096],f8E4M3FNUZ> -> tensor<4x14336x4096xf8E4M3FNUZ> | |
%1133 = util.call @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ_f32(%1131, %1132) : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x?x14336xf32> | |
%1134 = torch_c.from_builtin_tensor %1133 : tensor<4x?x14336xf32> -> !torch.vtensor<[4,?,14336],f32> | |
torch.bind_symbolic_shape %1134, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f32> | |
%1135 = torch.prim.ListConstruct : () -> !torch.list<int> | |
%1136 = torch.aten.permute %21, %1135 : !torch.vtensor<[],f32>, !torch.list<int> -> !torch.vtensor<[],f32> | |
%1137 = torch.aten.mul.Tensor %19, %1136 : !torch.vtensor<[],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[],f32> | |
%int6_408 = torch.constant.int 6 | |
%1138 = torch.prims.convert_element_type %1134, %int6_408 : !torch.vtensor<[4,?,14336],f32>, !torch.int -> !torch.vtensor<[4,?,14336],f32> | |
torch.bind_symbolic_shape %1138, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f32> | |
%1139 = torch.aten.mul.Tensor %1138, %1137 : !torch.vtensor<[4,?,14336],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,14336],f32> | |
torch.bind_symbolic_shape %1139, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f32> | |
%1140 = torch.aten.mul.Tensor %1124, %1139 : !torch.vtensor<[4,?,14336],f32>, !torch.vtensor<[4,?,14336],f32> -> !torch.vtensor<[4,?,14336],f32> | |
torch.bind_symbolic_shape %1140, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f32> | |
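// SwiGLU combine and down-projection: silu(gate) * up is re-quantized (scale %22), multiplied by the
// 4096x14336 weight %23, rescaled by %24 * %22, and added to the residual %1098.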
%1141 = torch.aten.div.Tensor %1140, %22 : !torch.vtensor<[4,?,14336],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,14336],f32> | |
torch.bind_symbolic_shape %1141, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f32> | |
%float-2.400000e02_409 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_410 = torch.constant.float 2.400000e+02 | |
%1142 = torch.aten.clamp %1141, %float-2.400000e02_409, %float2.400000e02_410 : !torch.vtensor<[4,?,14336],f32>, !torch.float, !torch.float -> !torch.vtensor<[4,?,14336],f32> | |
torch.bind_symbolic_shape %1142, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f32> | |
%int26_411 = torch.constant.int 26 | |
%1143 = torch.prims.convert_element_type %1142, %int26_411 : !torch.vtensor<[4,?,14336],f32>, !torch.int -> !torch.vtensor<[4,?,14336],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1143, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f8E4M3FNUZ> | |
%int0_412 = torch.constant.int 0 | |
%1144 = torch.aten.unsqueeze %23, %int0_412 : !torch.vtensor<[4096,14336],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,4096,14336],f8E4M3FNUZ> | |
%int4_413 = torch.constant.int 4 | |
%int4096_414 = torch.constant.int 4096 | |
%int14336_415 = torch.constant.int 14336 | |
%1145 = torch.prim.ListConstruct %int4_413, %int4096_414, %int14336_415 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_416 = torch.constant.bool false | |
%1146 = torch.aten.expand %1144, %1145, %false_416 : !torch.vtensor<[1,4096,14336],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[4,4096,14336],f8E4M3FNUZ> | |
%1147 = torch_c.to_builtin_tensor %1143 : !torch.vtensor<[4,?,14336],f8E4M3FNUZ> -> tensor<4x?x14336xf8E4M3FNUZ> | |
%1148 = torch_c.to_builtin_tensor %1146 : !torch.vtensor<[4,4096,14336],f8E4M3FNUZ> -> tensor<4x4096x14336xf8E4M3FNUZ> | |
%1149 = util.call @sharktank_batch_matmul_transpose_b_L4xDx14336xf8E4M3FNUZ_R4x4096x14336xf8E4M3FNUZ_f32(%1147, %1148) : (tensor<4x?x14336xf8E4M3FNUZ>, tensor<4x4096x14336xf8E4M3FNUZ>) -> tensor<4x?x4096xf32> | |
%1150 = torch_c.from_builtin_tensor %1149 : tensor<4x?x4096xf32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1150, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%1151 = torch.prim.ListConstruct : () -> !torch.list<int> | |
%1152 = torch.aten.permute %24, %1151 : !torch.vtensor<[],f32>, !torch.list<int> -> !torch.vtensor<[],f32> | |
%1153 = torch.aten.mul.Tensor %22, %1152 : !torch.vtensor<[],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[],f32> | |
%int6_417 = torch.constant.int 6 | |
%1154 = torch.prims.convert_element_type %1150, %int6_417 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1154, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%1155 = torch.aten.mul.Tensor %1154, %1153 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1155, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int1_418 = torch.constant.int 1 | |
%1156 = torch.aten.add.Tensor %1098, %1155, %int1_418 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1156, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
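// What appears to be the next block's attention RMSNorm over the new residual, scaled by the
// [4096] bf16 weight %25.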
%int6_419 = torch.constant.int 6 | |
%1157 = torch.prims.convert_element_type %1156, %int6_419 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1157, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int2_420 = torch.constant.int 2 | |
%1158 = torch.aten.pow.Tensor_Scalar %1157, %int2_420 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1158, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int-1_421 = torch.constant.int -1 | |
%1159 = torch.prim.ListConstruct %int-1_421 : (!torch.int) -> !torch.list<int> | |
%true_422 = torch.constant.bool true | |
%none_423 = torch.constant.none | |
%1160 = torch.aten.mean.dim %1158, %1159, %true_422, %none_423 : !torch.vtensor<[4,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[4,?,1],f32> | |
torch.bind_symbolic_shape %1160, [%773], affine_map<()[s0] -> (4, s0 * 32, 1)> : !torch.vtensor<[4,?,1],f32> | |
%float1.000000e-05_424 = torch.constant.float 1.000000e-05 | |
%int1_425 = torch.constant.int 1 | |
%1161 = torch.aten.add.Scalar %1160, %float1.000000e-05_424, %int1_425 : !torch.vtensor<[4,?,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[4,?,1],f32> | |
torch.bind_symbolic_shape %1161, [%773], affine_map<()[s0] -> (4, s0 * 32, 1)> : !torch.vtensor<[4,?,1],f32> | |
%1162 = torch.aten.rsqrt %1161 : !torch.vtensor<[4,?,1],f32> -> !torch.vtensor<[4,?,1],f32> | |
torch.bind_symbolic_shape %1162, [%773], affine_map<()[s0] -> (4, s0 * 32, 1)> : !torch.vtensor<[4,?,1],f32> | |
%1163 = torch.aten.mul.Tensor %1157, %1162 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[4,?,1],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1163, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int6_426 = torch.constant.int 6 | |
%1164 = torch.prims.convert_element_type %1163, %int6_426 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1164, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%1165 = torch.aten.mul.Tensor %25, %1164 : !torch.vtensor<[4096],bf16>, !torch.vtensor<[4,?,4096],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1165, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int6_427 = torch.constant.int 6 | |
%1166 = torch.prims.convert_element_type %1165, %int6_427 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1166, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
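// Q, K and V projections for this block: each path divides by its input scale, clamps to +/-240,
// converts to f8, runs a batch matmul against the 4096x4096 (Q) or 1024x4096 (K, V) weight, and then
// re-quantizes the f32 result back to f8 with the corresponding output scale.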
%1167 = torch.aten.div.Tensor %1166, %26 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1167, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%float-2.400000e02_428 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_429 = torch.constant.float 2.400000e+02 | |
%1168 = torch.aten.clamp %1167, %float-2.400000e02_428, %float2.400000e02_429 : !torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1168, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int26_430 = torch.constant.int 26 | |
%1169 = torch.prims.convert_element_type %1168, %int26_430 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1169, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
%int0_431 = torch.constant.int 0 | |
%1170 = torch.aten.unsqueeze %27, %int0_431 : !torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ> | |
%int4_432 = torch.constant.int 4 | |
%int4096_433 = torch.constant.int 4096 | |
%int4096_434 = torch.constant.int 4096 | |
%1171 = torch.prim.ListConstruct %int4_432, %int4096_433, %int4096_434 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_435 = torch.constant.bool false | |
%1172 = torch.aten.expand %1170, %1171, %false_435 : !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ> | |
%1173 = torch_c.to_builtin_tensor %1169 : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> -> tensor<4x?x4096xf8E4M3FNUZ> | |
%1174 = torch_c.to_builtin_tensor %1172 : !torch.vtensor<[4,4096,4096],f8E4M3FNUZ> -> tensor<4x4096x4096xf8E4M3FNUZ> | |
%1175 = util.call @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ_f32(%1173, %1174) : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x?x4096xf32> | |
%1176 = torch_c.from_builtin_tensor %1175 : tensor<4x?x4096xf32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1176, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%1177 = torch.aten.div.Tensor %1176, %28 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1177, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%float-2.400000e02_436 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_437 = torch.constant.float 2.400000e+02 | |
%1178 = torch.aten.clamp %1177, %float-2.400000e02_436, %float2.400000e02_437 : !torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1178, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int26_438 = torch.constant.int 26 | |
%1179 = torch.prims.convert_element_type %1178, %int26_438 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1179, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
%1180 = torch.aten.div.Tensor %1166, %29 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1180, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%float-2.400000e02_439 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_440 = torch.constant.float 2.400000e+02 | |
%1181 = torch.aten.clamp %1180, %float-2.400000e02_439, %float2.400000e02_440 : !torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1181, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int26_441 = torch.constant.int 26 | |
%1182 = torch.prims.convert_element_type %1181, %int26_441 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1182, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
%int0_442 = torch.constant.int 0 | |
%1183 = torch.aten.unsqueeze %30, %int0_442 : !torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ> | |
%int4_443 = torch.constant.int 4 | |
%int1024_444 = torch.constant.int 1024 | |
%int4096_445 = torch.constant.int 4096 | |
%1184 = torch.prim.ListConstruct %int4_443, %int1024_444, %int4096_445 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_446 = torch.constant.bool false | |
%1185 = torch.aten.expand %1183, %1184, %false_446 : !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ> | |
%1186 = torch_c.to_builtin_tensor %1182 : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> -> tensor<4x?x4096xf8E4M3FNUZ> | |
%1187 = torch_c.to_builtin_tensor %1185 : !torch.vtensor<[4,1024,4096],f8E4M3FNUZ> -> tensor<4x1024x4096xf8E4M3FNUZ> | |
%1188 = util.call @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ_f32(%1186, %1187) : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x?x1024xf32> | |
%1189 = torch_c.from_builtin_tensor %1188 : tensor<4x?x1024xf32> -> !torch.vtensor<[4,?,1024],f32> | |
torch.bind_symbolic_shape %1189, [%773], affine_map<()[s0] -> (4, s0 * 32, 1024)> : !torch.vtensor<[4,?,1024],f32> | |
%1190 = torch.aten.div.Tensor %1189, %31 : !torch.vtensor<[4,?,1024],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,1024],f32> | |
torch.bind_symbolic_shape %1190, [%773], affine_map<()[s0] -> (4, s0 * 32, 1024)> : !torch.vtensor<[4,?,1024],f32> | |
%float-2.400000e02_447 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_448 = torch.constant.float 2.400000e+02 | |
%1191 = torch.aten.clamp %1190, %float-2.400000e02_447, %float2.400000e02_448 : !torch.vtensor<[4,?,1024],f32>, !torch.float, !torch.float -> !torch.vtensor<[4,?,1024],f32> | |
torch.bind_symbolic_shape %1191, [%773], affine_map<()[s0] -> (4, s0 * 32, 1024)> : !torch.vtensor<[4,?,1024],f32> | |
%int26_449 = torch.constant.int 26 | |
%1192 = torch.prims.convert_element_type %1191, %int26_449 : !torch.vtensor<[4,?,1024],f32>, !torch.int -> !torch.vtensor<[4,?,1024],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1192, [%773], affine_map<()[s0] -> (4, s0 * 32, 1024)> : !torch.vtensor<[4,?,1024],f8E4M3FNUZ> | |
%1193 = torch.aten.div.Tensor %1166, %32 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1193, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%float-2.400000e02_450 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_451 = torch.constant.float 2.400000e+02 | |
%1194 = torch.aten.clamp %1193, %float-2.400000e02_450, %float2.400000e02_451 : !torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1194, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int26_452 = torch.constant.int 26 | |
%1195 = torch.prims.convert_element_type %1194, %int26_452 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1195, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
%int0_453 = torch.constant.int 0 | |
%1196 = torch.aten.unsqueeze %33, %int0_453 : !torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ> | |
%int4_454 = torch.constant.int 4 | |
%int1024_455 = torch.constant.int 1024 | |
%int4096_456 = torch.constant.int 4096 | |
%1197 = torch.prim.ListConstruct %int4_454, %int1024_455, %int4096_456 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_457 = torch.constant.bool false | |
%1198 = torch.aten.expand %1196, %1197, %false_457 : !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ> | |
%1199 = torch_c.to_builtin_tensor %1195 : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> -> tensor<4x?x4096xf8E4M3FNUZ> | |
%1200 = torch_c.to_builtin_tensor %1198 : !torch.vtensor<[4,1024,4096],f8E4M3FNUZ> -> tensor<4x1024x4096xf8E4M3FNUZ> | |
%1201 = util.call @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ_f32(%1199, %1200) : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x?x1024xf32> | |
%1202 = torch_c.from_builtin_tensor %1201 : tensor<4x?x1024xf32> -> !torch.vtensor<[4,?,1024],f32> | |
torch.bind_symbolic_shape %1202, [%773], affine_map<()[s0] -> (4, s0 * 32, 1024)> : !torch.vtensor<[4,?,1024],f32> | |
%1203 = torch.aten.div.Tensor %1202, %34 : !torch.vtensor<[4,?,1024],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,1024],f32> | |
torch.bind_symbolic_shape %1203, [%773], affine_map<()[s0] -> (4, s0 * 32, 1024)> : !torch.vtensor<[4,?,1024],f32> | |
%float-2.400000e02_458 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_459 = torch.constant.float 2.400000e+02 | |
%1204 = torch.aten.clamp %1203, %float-2.400000e02_458, %float2.400000e02_459 : !torch.vtensor<[4,?,1024],f32>, !torch.float, !torch.float -> !torch.vtensor<[4,?,1024],f32> | |
torch.bind_symbolic_shape %1204, [%773], affine_map<()[s0] -> (4, s0 * 32, 1024)> : !torch.vtensor<[4,?,1024],f32> | |
%int26_460 = torch.constant.int 26 | |
%1205 = torch.prims.convert_element_type %1204, %int26_460 : !torch.vtensor<[4,?,1024],f32>, !torch.int -> !torch.vtensor<[4,?,1024],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1205, [%773], affine_map<()[s0] -> (4, s0 * 32, 1024)> : !torch.vtensor<[4,?,1024],f8E4M3FNUZ> | |
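// Split the projections into heads: the 4096-wide projection becomes [4,?,32,128] (32 query heads) and the two 1024-wide projections become [4,?,8,128] (8 KV heads, head dim 128).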
%int4_461 = torch.constant.int 4 | |
%int32_462 = torch.constant.int 32 | |
%int128_463 = torch.constant.int 128 | |
%1206 = torch.prim.ListConstruct %int4_461, %777, %int32_462, %int128_463 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1207 = torch.aten.view %1179, %1206 : !torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1207, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 128)> : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
%int4_464 = torch.constant.int 4 | |
%int8_465 = torch.constant.int 8 | |
%int128_466 = torch.constant.int 128 | |
%1208 = torch.prim.ListConstruct %int4_464, %777, %int8_465, %int128_466 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1209 = torch.aten.view %1192, %1208 : !torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1209, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 128)> : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
%int4_467 = torch.constant.int 4 | |
%int8_468 = torch.constant.int 8 | |
%int128_469 = torch.constant.int 128 | |
%1210 = torch.prim.ListConstruct %int4_467, %777, %int8_468, %int128_469 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1211 = torch.aten.view %1205, %1210 : !torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1211, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 128)> : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
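// Build rotary-embedding tables for up to 131072 positions: inverse frequencies with base 5.0e5, followed by what looks like Llama-3.1 style frequency scaling (factor 8, smoothed between wavelengths 2048 and 8192), then cos/sin tables cast to bf16.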
%int131072_470 = torch.constant.int 131072 | |
%none_471 = torch.constant.none | |
%none_472 = torch.constant.none | |
%cpu_473 = torch.constant.device "cpu" | |
%false_474 = torch.constant.bool false | |
%1212 = torch.aten.arange %int131072_470, %none_471, %none_472, %cpu_473, %false_474 : !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[131072],si64> | |
%int0_475 = torch.constant.int 0 | |
%int128_476 = torch.constant.int 128 | |
%int2_477 = torch.constant.int 2 | |
%int4_478 = torch.constant.int 4 | |
%none_479 = torch.constant.none | |
%cpu_480 = torch.constant.device "cpu" | |
%false_481 = torch.constant.bool false | |
%1213 = torch.aten.arange.start_step %int0_475, %int128_476, %int2_477, %int4_478, %none_479, %cpu_480, %false_481 : !torch.int, !torch.int, !torch.int, !torch.int, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[64],si64> | |
%int6_482 = torch.constant.int 6 | |
%1214 = torch.prims.convert_element_type %1213, %int6_482 : !torch.vtensor<[64],si64>, !torch.int -> !torch.vtensor<[64],f32> | |
%int128_483 = torch.constant.int 128 | |
%1215 = torch.aten.div.Scalar %1214, %int128_483 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%float5.000000e05_484 = torch.constant.float 5.000000e+05 | |
%1216 = torch.aten.pow.Scalar %float5.000000e05_484, %1215 : !torch.float, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%1217 = torch.aten.reciprocal %1216 : !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%float1.000000e00_485 = torch.constant.float 1.000000e+00 | |
%1218 = torch.aten.mul.Scalar %1217, %float1.000000e00_485 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],f32> | |
%1219 = torch.aten.reciprocal %1218 : !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%float6.283190e00_486 = torch.constant.float 6.2831853071795862 | |
%1220 = torch.aten.mul.Scalar %1219, %float6.283190e00_486 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],f32> | |
%float8.192000e03_487 = torch.constant.float 8.192000e+03 | |
%1221 = torch.aten.gt.Scalar %1220, %float8.192000e03_487 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],i1> | |
%int8_488 = torch.constant.int 8 | |
%1222 = torch.aten.div.Scalar %1218, %int8_488 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%1223 = torch.aten.where.self %1221, %1222, %1218 : !torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%1224 = torch.aten.reciprocal %1220 : !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%int8192_489 = torch.constant.int 8192 | |
%1225 = torch.aten.mul.Scalar %1224, %int8192_489 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%int1_490 = torch.constant.int 1 | |
%int1_491 = torch.constant.int 1 | |
%1226 = torch.aten.sub.Scalar %1225, %int1_490, %int1_491 : !torch.vtensor<[64],f32>, !torch.int, !torch.int -> !torch.vtensor<[64],f32> | |
%int3_492 = torch.constant.int 3 | |
%1227 = torch.aten.div.Scalar %1226, %int3_492 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%int1_493 = torch.constant.int 1 | |
%int1_494 = torch.constant.int 1 | |
%1228 = torch.aten.rsub.Scalar %1227, %int1_493, %int1_494 : !torch.vtensor<[64],f32>, !torch.int, !torch.int -> !torch.vtensor<[64],f32> | |
%1229 = torch.aten.mul.Tensor %1228, %1223 : !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%int8_495 = torch.constant.int 8 | |
%1230 = torch.aten.div.Scalar %1229, %int8_495 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%1231 = torch.aten.mul.Tensor %1227, %1223 : !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%int1_496 = torch.constant.int 1 | |
%1232 = torch.aten.add.Tensor %1230, %1231, %int1_496 : !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%float2.048000e03_497 = torch.constant.float 2.048000e+03 | |
%1233 = torch.aten.lt.Scalar %1220, %float2.048000e03_497 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],i1> | |
%1234 = torch.aten.bitwise_not %1233 : !torch.vtensor<[64],i1> -> !torch.vtensor<[64],i1> | |
%float8.192000e03_498 = torch.constant.float 8.192000e+03 | |
%1235 = torch.aten.gt.Scalar %1220, %float8.192000e03_498 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],i1> | |
%1236 = torch.aten.bitwise_not %1235 : !torch.vtensor<[64],i1> -> !torch.vtensor<[64],i1> | |
%1237 = torch.aten.mul.Tensor %1234, %1236 : !torch.vtensor<[64],i1>, !torch.vtensor<[64],i1> -> !torch.vtensor<[64],i1> | |
%1238 = torch.aten.where.self %1237, %1232, %1223 : !torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%1239 = torch.prim.ListConstruct %1238, %1238 : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.list<vtensor> | |
%int-1_499 = torch.constant.int -1 | |
%1240 = torch.aten.cat %1239, %int-1_499 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[128],f32> | |
%int6_500 = torch.constant.int 6 | |
%1241 = torch.prims.convert_element_type %1240, %int6_500 : !torch.vtensor<[128],f32>, !torch.int -> !torch.vtensor<[128],f32> | |
%int1_501 = torch.constant.int 1 | |
%1242 = torch.aten.unsqueeze %1212, %int1_501 : !torch.vtensor<[131072],si64>, !torch.int -> !torch.vtensor<[131072,1],si64> | |
%int6_502 = torch.constant.int 6 | |
%1243 = torch.prims.convert_element_type %1242, %int6_502 : !torch.vtensor<[131072,1],si64>, !torch.int -> !torch.vtensor<[131072,1],f32> | |
%int0_503 = torch.constant.int 0 | |
%1244 = torch.aten.unsqueeze %1241, %int0_503 : !torch.vtensor<[128],f32>, !torch.int -> !torch.vtensor<[1,128],f32> | |
%int6_504 = torch.constant.int 6 | |
%1245 = torch.prims.convert_element_type %1244, %int6_504 : !torch.vtensor<[1,128],f32>, !torch.int -> !torch.vtensor<[1,128],f32> | |
%1246 = torch.aten.mul.Tensor %1243, %1245 : !torch.vtensor<[131072,1],f32>, !torch.vtensor<[1,128],f32> -> !torch.vtensor<[131072,128],f32> | |
%1247 = torch.aten.cos %1246 : !torch.vtensor<[131072,128],f32> -> !torch.vtensor<[131072,128],f32> | |
%int15_505 = torch.constant.int 15 | |
%1248 = torch.prims.convert_element_type %1247, %int15_505 : !torch.vtensor<[131072,128],f32>, !torch.int -> !torch.vtensor<[131072,128],bf16> | |
%1249 = torch.aten.sin %1246 : !torch.vtensor<[131072,128],f32> -> !torch.vtensor<[131072,128],f32> | |
%int15_506 = torch.constant.int 15 | |
%1250 = torch.prims.convert_element_type %1249, %int15_506 : !torch.vtensor<[131072,128],f32>, !torch.int -> !torch.vtensor<[131072,128],bf16> | |
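// Slice the cos/sin tables down to the current sequence length and broadcast them to [4, seq, 1, 128] so one table row multiplies every query head.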
%int0_507 = torch.constant.int 0 | |
%int0_508 = torch.constant.int 0 | |
%int1_509 = torch.constant.int 1 | |
%1251 = torch.aten.slice.Tensor %1248, %int0_507, %int0_508, %777, %int1_509 : !torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %1251, [%773], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int1_510 = torch.constant.int 1 | |
%int0_511 = torch.constant.int 0 | |
%int9223372036854775807_512 = torch.constant.int 9223372036854775807 | |
%int1_513 = torch.constant.int 1 | |
%1252 = torch.aten.slice.Tensor %1251, %int1_510, %int0_511, %int9223372036854775807_512, %int1_513 : !torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %1252, [%773], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int0_514 = torch.constant.int 0 | |
%int0_515 = torch.constant.int 0 | |
%int1_516 = torch.constant.int 1 | |
%1253 = torch.aten.slice.Tensor %1250, %int0_514, %int0_515, %777, %int1_516 : !torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %1253, [%773], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int1_517 = torch.constant.int 1 | |
%int0_518 = torch.constant.int 0 | |
%int9223372036854775807_519 = torch.constant.int 9223372036854775807 | |
%int1_520 = torch.constant.int 1 | |
%1254 = torch.aten.slice.Tensor %1253, %int1_517, %int0_518, %int9223372036854775807_519, %int1_520 : !torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %1254, [%773], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int0_521 = torch.constant.int 0 | |
%1255 = torch.aten.unsqueeze %1252, %int0_521 : !torch.vtensor<[?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %1255, [%773], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int1_522 = torch.constant.int 1 | |
%int0_523 = torch.constant.int 0 | |
%int9223372036854775807_524 = torch.constant.int 9223372036854775807 | |
%int1_525 = torch.constant.int 1 | |
%1256 = torch.aten.slice.Tensor %1255, %int1_522, %int0_523, %int9223372036854775807_524, %int1_525 : !torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %1256, [%773], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int2_526 = torch.constant.int 2 | |
%1257 = torch.aten.unsqueeze %1256, %int2_526 : !torch.vtensor<[1,?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %1257, [%773], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int3_527 = torch.constant.int 3 | |
%int0_528 = torch.constant.int 0 | |
%int9223372036854775807_529 = torch.constant.int 9223372036854775807 | |
%int1_530 = torch.constant.int 1 | |
%1258 = torch.aten.slice.Tensor %1257, %int3_527, %int0_528, %int9223372036854775807_529, %int1_530 : !torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %1258, [%773], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int4_531 = torch.constant.int 4 | |
%int1_532 = torch.constant.int 1 | |
%int1_533 = torch.constant.int 1 | |
%int1_534 = torch.constant.int 1 | |
%1259 = torch.prim.ListConstruct %int4_531, %int1_532, %int1_533, %int1_534 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1260 = torch.aten.repeat %1258, %1259 : !torch.vtensor<[1,?,1,128],bf16>, !torch.list<int> -> !torch.vtensor<[4,?,1,128],bf16> | |
torch.bind_symbolic_shape %1260, [%773], affine_map<()[s0] -> (4, s0 * 32, 1, 128)> : !torch.vtensor<[4,?,1,128],bf16> | |
%int0_535 = torch.constant.int 0 | |
%1261 = torch.aten.unsqueeze %1254, %int0_535 : !torch.vtensor<[?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %1261, [%773], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int1_536 = torch.constant.int 1 | |
%int0_537 = torch.constant.int 0 | |
%int9223372036854775807_538 = torch.constant.int 9223372036854775807 | |
%int1_539 = torch.constant.int 1 | |
%1262 = torch.aten.slice.Tensor %1261, %int1_536, %int0_537, %int9223372036854775807_538, %int1_539 : !torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %1262, [%773], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int2_540 = torch.constant.int 2 | |
%1263 = torch.aten.unsqueeze %1262, %int2_540 : !torch.vtensor<[1,?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %1263, [%773], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int3_541 = torch.constant.int 3 | |
%int0_542 = torch.constant.int 0 | |
%int9223372036854775807_543 = torch.constant.int 9223372036854775807 | |
%int1_544 = torch.constant.int 1 | |
%1264 = torch.aten.slice.Tensor %1263, %int3_541, %int0_542, %int9223372036854775807_543, %int1_544 : !torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %1264, [%773], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int4_545 = torch.constant.int 4 | |
%int1_546 = torch.constant.int 1 | |
%int1_547 = torch.constant.int 1 | |
%int1_548 = torch.constant.int 1 | |
%1265 = torch.prim.ListConstruct %int4_545, %int1_546, %int1_547, %int1_548 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1266 = torch.aten.repeat %1264, %1265 : !torch.vtensor<[1,?,1,128],bf16>, !torch.list<int> -> !torch.vtensor<[4,?,1,128],bf16> | |
torch.bind_symbolic_shape %1266, [%773], affine_map<()[s0] -> (4, s0 * 32, 1, 128)> : !torch.vtensor<[4,?,1,128],bf16> | |
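// Apply the rotary embedding to Q: q*cos + rotate_half(q)*sin, where rotate_half negates the upper 64 lanes and concatenates them ahead of the lower 64.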
%1267 = torch.aten.mul.Tensor %1207, %1260 : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16> -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1267, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 128)> : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
%int3_549 = torch.constant.int 3 | |
%int0_550 = torch.constant.int 0 | |
%int64_551 = torch.constant.int 64 | |
%int1_552 = torch.constant.int 1 | |
%1268 = torch.aten.slice.Tensor %1207, %int3_549, %int0_550, %int64_551, %int1_552 : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1268, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 64)> : !torch.vtensor<[4,?,32,64],f8E4M3FNUZ> | |
%int3_553 = torch.constant.int 3 | |
%int64_554 = torch.constant.int 64 | |
%int9223372036854775807_555 = torch.constant.int 9223372036854775807 | |
%int1_556 = torch.constant.int 1 | |
%1269 = torch.aten.slice.Tensor %1207, %int3_553, %int64_554, %int9223372036854775807_555, %int1_556 : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1269, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 64)> : !torch.vtensor<[4,?,32,64],f8E4M3FNUZ> | |
%1270 = torch.aten.neg %1269 : !torch.vtensor<[4,?,32,64],f8E4M3FNUZ> -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1270, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 64)> : !torch.vtensor<[4,?,32,64],f8E4M3FNUZ> | |
%1271 = torch.prim.ListConstruct %1270, %1268 : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>) -> !torch.list<vtensor> | |
%int-1_557 = torch.constant.int -1 | |
%1272 = torch.aten.cat %1271, %int-1_557 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1272, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 128)> : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
%1273 = torch.aten.mul.Tensor %1272, %1266 : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16> -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1273, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 128)> : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
%int1_558 = torch.constant.int 1 | |
%1274 = torch.aten.add.Tensor %1267, %1273, %int1_558 : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1274, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 128)> : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
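// The same frequency, cos/sin and broadcast computation is repeated for the key path.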
%int131072_559 = torch.constant.int 131072 | |
%none_560 = torch.constant.none | |
%none_561 = torch.constant.none | |
%cpu_562 = torch.constant.device "cpu" | |
%false_563 = torch.constant.bool false | |
%1275 = torch.aten.arange %int131072_559, %none_560, %none_561, %cpu_562, %false_563 : !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[131072],si64> | |
%int0_564 = torch.constant.int 0 | |
%int128_565 = torch.constant.int 128 | |
%int2_566 = torch.constant.int 2 | |
%int4_567 = torch.constant.int 4 | |
%none_568 = torch.constant.none | |
%cpu_569 = torch.constant.device "cpu" | |
%false_570 = torch.constant.bool false | |
%1276 = torch.aten.arange.start_step %int0_564, %int128_565, %int2_566, %int4_567, %none_568, %cpu_569, %false_570 : !torch.int, !torch.int, !torch.int, !torch.int, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[64],si64> | |
%int6_571 = torch.constant.int 6 | |
%1277 = torch.prims.convert_element_type %1276, %int6_571 : !torch.vtensor<[64],si64>, !torch.int -> !torch.vtensor<[64],f32> | |
%int128_572 = torch.constant.int 128 | |
%1278 = torch.aten.div.Scalar %1277, %int128_572 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%float5.000000e05_573 = torch.constant.float 5.000000e+05 | |
%1279 = torch.aten.pow.Scalar %float5.000000e05_573, %1278 : !torch.float, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%1280 = torch.aten.reciprocal %1279 : !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%float1.000000e00_574 = torch.constant.float 1.000000e+00 | |
%1281 = torch.aten.mul.Scalar %1280, %float1.000000e00_574 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],f32> | |
%1282 = torch.aten.reciprocal %1281 : !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%float6.283190e00_575 = torch.constant.float 6.2831853071795862 | |
%1283 = torch.aten.mul.Scalar %1282, %float6.283190e00_575 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],f32> | |
%float8.192000e03_576 = torch.constant.float 8.192000e+03 | |
%1284 = torch.aten.gt.Scalar %1283, %float8.192000e03_576 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],i1> | |
%int8_577 = torch.constant.int 8 | |
%1285 = torch.aten.div.Scalar %1281, %int8_577 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%1286 = torch.aten.where.self %1284, %1285, %1281 : !torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%1287 = torch.aten.reciprocal %1283 : !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%int8192_578 = torch.constant.int 8192 | |
%1288 = torch.aten.mul.Scalar %1287, %int8192_578 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%int1_579 = torch.constant.int 1 | |
%int1_580 = torch.constant.int 1 | |
%1289 = torch.aten.sub.Scalar %1288, %int1_579, %int1_580 : !torch.vtensor<[64],f32>, !torch.int, !torch.int -> !torch.vtensor<[64],f32> | |
%int3_581 = torch.constant.int 3 | |
%1290 = torch.aten.div.Scalar %1289, %int3_581 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%int1_582 = torch.constant.int 1 | |
%int1_583 = torch.constant.int 1 | |
%1291 = torch.aten.rsub.Scalar %1290, %int1_582, %int1_583 : !torch.vtensor<[64],f32>, !torch.int, !torch.int -> !torch.vtensor<[64],f32> | |
%1292 = torch.aten.mul.Tensor %1291, %1286 : !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%int8_584 = torch.constant.int 8 | |
%1293 = torch.aten.div.Scalar %1292, %int8_584 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%1294 = torch.aten.mul.Tensor %1290, %1286 : !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%int1_585 = torch.constant.int 1 | |
%1295 = torch.aten.add.Tensor %1293, %1294, %int1_585 : !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%float2.048000e03_586 = torch.constant.float 2.048000e+03 | |
%1296 = torch.aten.lt.Scalar %1283, %float2.048000e03_586 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],i1> | |
%1297 = torch.aten.bitwise_not %1296 : !torch.vtensor<[64],i1> -> !torch.vtensor<[64],i1> | |
%float8.192000e03_587 = torch.constant.float 8.192000e+03 | |
%1298 = torch.aten.gt.Scalar %1283, %float8.192000e03_587 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],i1> | |
%1299 = torch.aten.bitwise_not %1298 : !torch.vtensor<[64],i1> -> !torch.vtensor<[64],i1> | |
%1300 = torch.aten.mul.Tensor %1297, %1299 : !torch.vtensor<[64],i1>, !torch.vtensor<[64],i1> -> !torch.vtensor<[64],i1> | |
%1301 = torch.aten.where.self %1300, %1295, %1286 : !torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%1302 = torch.prim.ListConstruct %1301, %1301 : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.list<vtensor> | |
%int-1_588 = torch.constant.int -1 | |
%1303 = torch.aten.cat %1302, %int-1_588 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[128],f32> | |
%int6_589 = torch.constant.int 6 | |
%1304 = torch.prims.convert_element_type %1303, %int6_589 : !torch.vtensor<[128],f32>, !torch.int -> !torch.vtensor<[128],f32> | |
%int1_590 = torch.constant.int 1 | |
%1305 = torch.aten.unsqueeze %1275, %int1_590 : !torch.vtensor<[131072],si64>, !torch.int -> !torch.vtensor<[131072,1],si64> | |
%int6_591 = torch.constant.int 6 | |
%1306 = torch.prims.convert_element_type %1305, %int6_591 : !torch.vtensor<[131072,1],si64>, !torch.int -> !torch.vtensor<[131072,1],f32> | |
%int0_592 = torch.constant.int 0 | |
%1307 = torch.aten.unsqueeze %1304, %int0_592 : !torch.vtensor<[128],f32>, !torch.int -> !torch.vtensor<[1,128],f32> | |
%int6_593 = torch.constant.int 6 | |
%1308 = torch.prims.convert_element_type %1307, %int6_593 : !torch.vtensor<[1,128],f32>, !torch.int -> !torch.vtensor<[1,128],f32> | |
%1309 = torch.aten.mul.Tensor %1306, %1308 : !torch.vtensor<[131072,1],f32>, !torch.vtensor<[1,128],f32> -> !torch.vtensor<[131072,128],f32> | |
%1310 = torch.aten.cos %1309 : !torch.vtensor<[131072,128],f32> -> !torch.vtensor<[131072,128],f32> | |
%int15_594 = torch.constant.int 15 | |
%1311 = torch.prims.convert_element_type %1310, %int15_594 : !torch.vtensor<[131072,128],f32>, !torch.int -> !torch.vtensor<[131072,128],bf16> | |
%1312 = torch.aten.sin %1309 : !torch.vtensor<[131072,128],f32> -> !torch.vtensor<[131072,128],f32> | |
%int15_595 = torch.constant.int 15 | |
%1313 = torch.prims.convert_element_type %1312, %int15_595 : !torch.vtensor<[131072,128],f32>, !torch.int -> !torch.vtensor<[131072,128],bf16> | |
%int0_596 = torch.constant.int 0 | |
%int0_597 = torch.constant.int 0 | |
%int1_598 = torch.constant.int 1 | |
%1314 = torch.aten.slice.Tensor %1311, %int0_596, %int0_597, %777, %int1_598 : !torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %1314, [%773], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int1_599 = torch.constant.int 1 | |
%int0_600 = torch.constant.int 0 | |
%int9223372036854775807_601 = torch.constant.int 9223372036854775807 | |
%int1_602 = torch.constant.int 1 | |
%1315 = torch.aten.slice.Tensor %1314, %int1_599, %int0_600, %int9223372036854775807_601, %int1_602 : !torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %1315, [%773], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int0_603 = torch.constant.int 0 | |
%int0_604 = torch.constant.int 0 | |
%int1_605 = torch.constant.int 1 | |
%1316 = torch.aten.slice.Tensor %1313, %int0_603, %int0_604, %777, %int1_605 : !torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %1316, [%773], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int1_606 = torch.constant.int 1 | |
%int0_607 = torch.constant.int 0 | |
%int9223372036854775807_608 = torch.constant.int 9223372036854775807 | |
%int1_609 = torch.constant.int 1 | |
%1317 = torch.aten.slice.Tensor %1316, %int1_606, %int0_607, %int9223372036854775807_608, %int1_609 : !torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %1317, [%773], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int0_610 = torch.constant.int 0 | |
%1318 = torch.aten.unsqueeze %1315, %int0_610 : !torch.vtensor<[?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %1318, [%773], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int1_611 = torch.constant.int 1 | |
%int0_612 = torch.constant.int 0 | |
%int9223372036854775807_613 = torch.constant.int 9223372036854775807 | |
%int1_614 = torch.constant.int 1 | |
%1319 = torch.aten.slice.Tensor %1318, %int1_611, %int0_612, %int9223372036854775807_613, %int1_614 : !torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %1319, [%773], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int2_615 = torch.constant.int 2 | |
%1320 = torch.aten.unsqueeze %1319, %int2_615 : !torch.vtensor<[1,?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %1320, [%773], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int3_616 = torch.constant.int 3 | |
%int0_617 = torch.constant.int 0 | |
%int9223372036854775807_618 = torch.constant.int 9223372036854775807 | |
%int1_619 = torch.constant.int 1 | |
%1321 = torch.aten.slice.Tensor %1320, %int3_616, %int0_617, %int9223372036854775807_618, %int1_619 : !torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %1321, [%773], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int4_620 = torch.constant.int 4 | |
%int1_621 = torch.constant.int 1 | |
%int1_622 = torch.constant.int 1 | |
%int1_623 = torch.constant.int 1 | |
%1322 = torch.prim.ListConstruct %int4_620, %int1_621, %int1_622, %int1_623 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1323 = torch.aten.repeat %1321, %1322 : !torch.vtensor<[1,?,1,128],bf16>, !torch.list<int> -> !torch.vtensor<[4,?,1,128],bf16> | |
torch.bind_symbolic_shape %1323, [%773], affine_map<()[s0] -> (4, s0 * 32, 1, 128)> : !torch.vtensor<[4,?,1,128],bf16> | |
%int0_624 = torch.constant.int 0 | |
%1324 = torch.aten.unsqueeze %1317, %int0_624 : !torch.vtensor<[?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %1324, [%773], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int1_625 = torch.constant.int 1 | |
%int0_626 = torch.constant.int 0 | |
%int9223372036854775807_627 = torch.constant.int 9223372036854775807 | |
%int1_628 = torch.constant.int 1 | |
%1325 = torch.aten.slice.Tensor %1324, %int1_625, %int0_626, %int9223372036854775807_627, %int1_628 : !torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %1325, [%773], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int2_629 = torch.constant.int 2 | |
%1326 = torch.aten.unsqueeze %1325, %int2_629 : !torch.vtensor<[1,?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %1326, [%773], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int3_630 = torch.constant.int 3 | |
%int0_631 = torch.constant.int 0 | |
%int9223372036854775807_632 = torch.constant.int 9223372036854775807 | |
%int1_633 = torch.constant.int 1 | |
%1327 = torch.aten.slice.Tensor %1326, %int3_630, %int0_631, %int9223372036854775807_632, %int1_633 : !torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %1327, [%773], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int4_634 = torch.constant.int 4 | |
%int1_635 = torch.constant.int 1 | |
%int1_636 = torch.constant.int 1 | |
%int1_637 = torch.constant.int 1 | |
%1328 = torch.prim.ListConstruct %int4_634, %int1_635, %int1_636, %int1_637 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1329 = torch.aten.repeat %1327, %1328 : !torch.vtensor<[1,?,1,128],bf16>, !torch.list<int> -> !torch.vtensor<[4,?,1,128],bf16> | |
torch.bind_symbolic_shape %1329, [%773], affine_map<()[s0] -> (4, s0 * 32, 1, 128)> : !torch.vtensor<[4,?,1,128],bf16> | |
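// Apply the rotary embedding to K, identical to the query path but over the 8 KV heads.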
%1330 = torch.aten.mul.Tensor %1209, %1323 : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16> -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1330, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 128)> : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
%int3_638 = torch.constant.int 3 | |
%int0_639 = torch.constant.int 0 | |
%int64_640 = torch.constant.int 64 | |
%int1_641 = torch.constant.int 1 | |
%1331 = torch.aten.slice.Tensor %1209, %int3_638, %int0_639, %int64_640, %int1_641 : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1331, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 64)> : !torch.vtensor<[4,?,8,64],f8E4M3FNUZ> | |
%int3_642 = torch.constant.int 3 | |
%int64_643 = torch.constant.int 64 | |
%int9223372036854775807_644 = torch.constant.int 9223372036854775807 | |
%int1_645 = torch.constant.int 1 | |
%1332 = torch.aten.slice.Tensor %1209, %int3_642, %int64_643, %int9223372036854775807_644, %int1_645 : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1332, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 64)> : !torch.vtensor<[4,?,8,64],f8E4M3FNUZ> | |
%1333 = torch.aten.neg %1332 : !torch.vtensor<[4,?,8,64],f8E4M3FNUZ> -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1333, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 64)> : !torch.vtensor<[4,?,8,64],f8E4M3FNUZ> | |
%1334 = torch.prim.ListConstruct %1333, %1331 : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>) -> !torch.list<vtensor> | |
%int-1_646 = torch.constant.int -1 | |
%1335 = torch.aten.cat %1334, %int-1_646 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1335, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 128)> : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
%1336 = torch.aten.mul.Tensor %1335, %1329 : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16> -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1336, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 128)> : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
%int1_647 = torch.constant.int 1 | |
%1337 = torch.aten.add.Tensor %1330, %1336, %int1_647 : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1337, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 128)> : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
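// Compute flat paged-cache slot indices from the page table %arg2 (page id * 64 plus an offset that appears to select this layer's K slice within each 64-entry page block), and regroup the rotated K into 32-token cache pages.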
%int64_648 = torch.constant.int 64 | |
%1338 = torch.aten.mul.Scalar %arg2, %int64_648 : !torch.vtensor<[4,?],si64>, !torch.int -> !torch.vtensor<[4,?],si64> | |
torch.bind_symbolic_shape %1338, [%773], affine_map<()[s0] -> (4, s0)> : !torch.vtensor<[4,?],si64> | |
%int2_649 = torch.constant.int 2 | |
%int1_650 = torch.constant.int 1 | |
%1339 = torch.aten.add.Scalar %1338, %int2_649, %int1_650 : !torch.vtensor<[4,?],si64>, !torch.int, !torch.int -> !torch.vtensor<[4,?],si64> | |
torch.bind_symbolic_shape %1339, [%773], affine_map<()[s0] -> (4, s0)> : !torch.vtensor<[4,?],si64> | |
%int4_651 = torch.constant.int 4 | |
%int32_652 = torch.constant.int 32 | |
%int8_653 = torch.constant.int 8 | |
%int128_654 = torch.constant.int 128 | |
%1340 = torch.prim.ListConstruct %int4_651, %775, %int32_652, %int8_653, %int128_654 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1341 = torch.aten.view %1337, %1340 : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1341, [%773], affine_map<()[s0] -> (4, s0, 32, 8, 128)> : !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ> | |
%int32_655 = torch.constant.int 32 | |
%int8_656 = torch.constant.int 8 | |
%int128_657 = torch.constant.int 128 | |
%1342 = torch.prim.ListConstruct %997, %int32_655, %int8_656, %int128_657 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1343 = torch.aten.view %1341, %1342 : !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1343, [%773], affine_map<()[s0] -> (s0 * 4, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%1344 = torch.prim.ListConstruct %997 : (!torch.int) -> !torch.list<int> | |
%1345 = torch.aten.view %1339, %1344 : !torch.vtensor<[4,?],si64>, !torch.list<int> -> !torch.vtensor<[?],si64> | |
torch.bind_symbolic_shape %1345, [%773], affine_map<()[s0] -> (s0 * 4)> : !torch.vtensor<[?],si64> | |
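// Scatter K into the shared paged KV cache: both the values and the cache are bit-cast to si8 (seemingly a workaround for index_put on f8 element types), written with index_put at the computed slots, then bit-cast back and folded into the flat [?,2097152] cache layout.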
%int26_658 = torch.constant.int 26 | |
%1346 = torch.prims.convert_element_type %1343, %int26_658 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1346, [%773], affine_map<()[s0] -> (s0 * 4, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%int1_659 = torch.constant.int 1 | |
%1347 = torch.aten.view.dtype %1346, %int1_659 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %1347, [%773], affine_map<()[s0] -> (s0 * 4, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%1348 = torch.aten.detach %1347 : !torch.vtensor<[?,32,8,128],si8> -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %1348, [%773], affine_map<()[s0] -> (s0 * 4, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%1349 = torch.aten.detach %1348 : !torch.vtensor<[?,32,8,128],si8> -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %1349, [%773], affine_map<()[s0] -> (s0 * 4, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%int32_660 = torch.constant.int 32 | |
%int2_661 = torch.constant.int 2 | |
%int32_662 = torch.constant.int 32 | |
%int8_663 = torch.constant.int 8 | |
%int128_664 = torch.constant.int 128 | |
%1350 = torch.prim.ListConstruct %776, %int32_660, %int2_661, %int32_662, %int8_663, %int128_664 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1351 = torch.aten.view %1044, %1350 : !torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1351, [%774], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
%int32_665 = torch.constant.int 32 | |
%int8_666 = torch.constant.int 8 | |
%int128_667 = torch.constant.int 128 | |
%1352 = torch.prim.ListConstruct %990, %int32_665, %int8_666, %int128_667 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1353 = torch.aten.view %1351, %1352 : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1353, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%int1_668 = torch.constant.int 1 | |
%1354 = torch.aten.view.dtype %1353, %int1_668 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %1354, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%1355 = torch.aten.detach %1354 : !torch.vtensor<[?,32,8,128],si8> -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %1355, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%1356 = torch.aten.detach %1355 : !torch.vtensor<[?,32,8,128],si8> -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %1356, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%1357 = torch.prim.ListConstruct %1345 : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>> | |
%false_669 = torch.constant.bool false | |
%1358 = torch.aten.index_put %1356, %1357, %1349, %false_669 : !torch.vtensor<[?,32,8,128],si8>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],si8>, !torch.bool -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %1358, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%int26_670 = torch.constant.int 26 | |
%1359 = torch.aten.view.dtype %1358, %int26_670 : !torch.vtensor<[?,32,8,128],si8>, !torch.int -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1359, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%1360 = torch.aten.detach %1359 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1360, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%1361 = torch.aten.detach %1360 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1361, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%int32_671 = torch.constant.int 32 | |
%int2_672 = torch.constant.int 2 | |
%int32_673 = torch.constant.int 32 | |
%int8_674 = torch.constant.int 8 | |
%int128_675 = torch.constant.int 128 | |
%1362 = torch.prim.ListConstruct %776, %int32_671, %int2_672, %int32_673, %int8_674, %int128_675 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1363 = torch.aten.view %1361, %1362 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1363, [%774], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
%int2097152_676 = torch.constant.int 2097152 | |
%1364 = torch.prim.ListConstruct %776, %int2097152_676 : (!torch.int, !torch.int) -> !torch.list<int> | |
%1365 = torch.aten.view %1363, %1364 : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,2097152],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1365, [%774], affine_map<()[s0] -> (s0, 2097152)> : !torch.vtensor<[?,2097152],f8E4M3FNUZ> | |
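// Write V into the cache the same way, at slot indices offset by one.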
%int4_677 = torch.constant.int 4 | |
%int32_678 = torch.constant.int 32 | |
%int8_679 = torch.constant.int 8 | |
%int128_680 = torch.constant.int 128 | |
%1366 = torch.prim.ListConstruct %int4_677, %775, %int32_678, %int8_679, %int128_680 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1367 = torch.aten.view %1211, %1366 : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1367, [%773], affine_map<()[s0] -> (4, s0, 32, 8, 128)> : !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ> | |
%int32_681 = torch.constant.int 32 | |
%int8_682 = torch.constant.int 8 | |
%int128_683 = torch.constant.int 128 | |
%1368 = torch.prim.ListConstruct %997, %int32_681, %int8_682, %int128_683 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1369 = torch.aten.view %1367, %1368 : !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1369, [%773], affine_map<()[s0] -> (s0 * 4, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%int1_684 = torch.constant.int 1 | |
%int1_685 = torch.constant.int 1 | |
%1370 = torch.aten.add.Scalar %1339, %int1_684, %int1_685 : !torch.vtensor<[4,?],si64>, !torch.int, !torch.int -> !torch.vtensor<[4,?],si64> | |
torch.bind_symbolic_shape %1370, [%773], affine_map<()[s0] -> (4, s0)> : !torch.vtensor<[4,?],si64> | |
%1371 = torch.prim.ListConstruct %997 : (!torch.int) -> !torch.list<int> | |
%1372 = torch.aten.view %1370, %1371 : !torch.vtensor<[4,?],si64>, !torch.list<int> -> !torch.vtensor<[?],si64> | |
torch.bind_symbolic_shape %1372, [%773], affine_map<()[s0] -> (s0 * 4)> : !torch.vtensor<[?],si64> | |
%int26_686 = torch.constant.int 26 | |
%1373 = torch.prims.convert_element_type %1369, %int26_686 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1373, [%773], affine_map<()[s0] -> (s0 * 4, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%int1_687 = torch.constant.int 1 | |
%1374 = torch.aten.view.dtype %1373, %int1_687 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %1374, [%773], affine_map<()[s0] -> (s0 * 4, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%1375 = torch.aten.detach %1374 : !torch.vtensor<[?,32,8,128],si8> -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %1375, [%773], affine_map<()[s0] -> (s0 * 4, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%1376 = torch.aten.detach %1375 : !torch.vtensor<[?,32,8,128],si8> -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %1376, [%773], affine_map<()[s0] -> (s0 * 4, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%int32_688 = torch.constant.int 32 | |
%int2_689 = torch.constant.int 2 | |
%int32_690 = torch.constant.int 32 | |
%int8_691 = torch.constant.int 8 | |
%int128_692 = torch.constant.int 128 | |
%1377 = torch.prim.ListConstruct %776, %int32_688, %int2_689, %int32_690, %int8_691, %int128_692 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1378 = torch.aten.view %1365, %1377 : !torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1378, [%774], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
%int32_693 = torch.constant.int 32 | |
%int8_694 = torch.constant.int 8 | |
%int128_695 = torch.constant.int 128 | |
%1379 = torch.prim.ListConstruct %990, %int32_693, %int8_694, %int128_695 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1380 = torch.aten.view %1378, %1379 : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1380, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%int1_696 = torch.constant.int 1 | |
%1381 = torch.aten.view.dtype %1380, %int1_696 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %1381, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%1382 = torch.aten.detach %1381 : !torch.vtensor<[?,32,8,128],si8> -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %1382, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%1383 = torch.aten.detach %1382 : !torch.vtensor<[?,32,8,128],si8> -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %1383, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%1384 = torch.prim.ListConstruct %1372 : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>> | |
%false_697 = torch.constant.bool false | |
%1385 = torch.aten.index_put %1383, %1384, %1376, %false_697 : !torch.vtensor<[?,32,8,128],si8>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],si8>, !torch.bool -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %1385, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%int26_698 = torch.constant.int 26 | |
%1386 = torch.aten.view.dtype %1385, %int26_698 : !torch.vtensor<[?,32,8,128],si8>, !torch.int -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1386, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%1387 = torch.aten.detach %1386 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1387, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%1388 = torch.aten.detach %1387 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1388, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%int32_699 = torch.constant.int 32 | |
%int2_700 = torch.constant.int 2 | |
%int32_701 = torch.constant.int 32 | |
%int8_702 = torch.constant.int 8 | |
%int128_703 = torch.constant.int 128 | |
%1389 = torch.prim.ListConstruct %776, %int32_699, %int2_700, %int32_701, %int8_702, %int128_703 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1390 = torch.aten.view %1388, %1389 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1390, [%774], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
%int2097152_704 = torch.constant.int 2097152 | |
%1391 = torch.prim.ListConstruct %776, %int2097152_704 : (!torch.int, !torch.int) -> !torch.list<int> | |
%1392 = torch.aten.view %1390, %1391 : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,2097152],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1392, [%774], affine_map<()[s0] -> (s0, 2097152)> : !torch.vtensor<[?,2097152],f8E4M3FNUZ> | |
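// Grouped-query attention: replicate each of the 8 KV heads 4 times so K and V line up with the 32 query heads ([4,?,8,128] -> [4,?,32,128]).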
%int-2_705 = torch.constant.int -2 | |
%1393 = torch.aten.unsqueeze %1337, %int-2_705 : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1393, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 1, 128)> : !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ> | |
%int4_706 = torch.constant.int 4 | |
%int8_707 = torch.constant.int 8 | |
%int4_708 = torch.constant.int 4 | |
%int128_709 = torch.constant.int 128 | |
%1394 = torch.prim.ListConstruct %int4_706, %777, %int8_707, %int4_708, %int128_709 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_710 = torch.constant.bool false | |
%1395 = torch.aten.expand %1393, %1394, %false_710 : !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1395, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 4, 128)> : !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ> | |
%int0_711 = torch.constant.int 0 | |
%1396 = torch.aten.clone %1395, %int0_711 : !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1396, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 4, 128)> : !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ> | |
%int4_712 = torch.constant.int 4 | |
%int32_713 = torch.constant.int 32 | |
%int128_714 = torch.constant.int 128 | |
%1397 = torch.prim.ListConstruct %int4_712, %777, %int32_713, %int128_714 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1398 = torch.aten._unsafe_view %1396, %1397 : !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1398, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 128)> : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
%int-2_715 = torch.constant.int -2 | |
%1399 = torch.aten.unsqueeze %1211, %int-2_715 : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1399, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 1, 128)> : !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ> | |
%int4_716 = torch.constant.int 4 | |
%int8_717 = torch.constant.int 8 | |
%int4_718 = torch.constant.int 4 | |
%int128_719 = torch.constant.int 128 | |
%1400 = torch.prim.ListConstruct %int4_716, %777, %int8_717, %int4_718, %int128_719 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_720 = torch.constant.bool false | |
%1401 = torch.aten.expand %1399, %1400, %false_720 : !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1401, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 4, 128)> : !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ> | |
%int0_721 = torch.constant.int 0 | |
%1402 = torch.aten.clone %1401, %int0_721 : !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1402, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 4, 128)> : !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ> | |
%int4_722 = torch.constant.int 4 | |
%int32_723 = torch.constant.int 32 | |
%int128_724 = torch.constant.int 128 | |
%1403 = torch.prim.ListConstruct %int4_722, %777, %int32_723, %int128_724 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1404 = torch.aten._unsafe_view %1402, %1403 : !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1404, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 128)> : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
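// Transpose Q, K and V to [batch, heads, seq, head_dim] and re-assert their f8E4M3FNUZ element type.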
%int1_725 = torch.constant.int 1 | |
%int2_726 = torch.constant.int 2 | |
%1405 = torch.aten.transpose.int %1274, %int1_725, %int2_726 : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1405, [%773], affine_map<()[s0] -> (4, 32, s0 * 32, 128)> : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
%int1_727 = torch.constant.int 1 | |
%int2_728 = torch.constant.int 2 | |
%1406 = torch.aten.transpose.int %1398, %int1_727, %int2_728 : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1406, [%773], affine_map<()[s0] -> (4, 32, s0 * 32, 128)> : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
%int1_729 = torch.constant.int 1 | |
%int2_730 = torch.constant.int 2 | |
%1407 = torch.aten.transpose.int %1404, %int1_729, %int2_730 : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1407, [%773], affine_map<()[s0] -> (4, 32, s0 * 32, 128)> : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
%int26_731 = torch.constant.int 26 | |
%1408 = torch.prims.convert_element_type %1405, %int26_731 : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1408, [%773], affine_map<()[s0] -> (4, 32, s0 * 32, 128)> : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
%int26_732 = torch.constant.int 26 | |
%1409 = torch.prims.convert_element_type %1406, %int26_732 : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1409, [%773], affine_map<()[s0] -> (4, 32, s0 * 32, 128)> : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
%int26_733 = torch.constant.int 26 | |
%1410 = torch.prims.convert_element_type %1407, %int26_733 : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1410, [%773], affine_map<()[s0] -> (4, 32, s0 * 32, 128)> : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
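// Cast the attention mask to f8 and take the first batch element's [seq, seq] slice, the 2-D mask form the fused attention kernel consumes.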
%int26_734 = torch.constant.int 26 | |
%1411 = torch.prims.convert_element_type %803, %int26_734 : !torch.vtensor<[4,1,?,?],f32>, !torch.int -> !torch.vtensor<[4,1,?,?],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1411, [%773], affine_map<()[s0] -> (4, 1, s0 * 32, s0 * 32)> : !torch.vtensor<[4,1,?,?],f8E4M3FNUZ> | |
%int0_735 = torch.constant.int 0 | |
%int0_736 = torch.constant.int 0 | |
%1412 = torch.aten.select.int %1411, %int0_735, %int0_736 : !torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[1,?,?],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1412, [%773], affine_map<()[s0] -> (1, s0 * 32, s0 * 32)> : !torch.vtensor<[1,?,?],f8E4M3FNUZ> | |
%int0_737 = torch.constant.int 0 | |
%int0_738 = torch.constant.int 0 | |
%1413 = torch.aten.select.int %1412, %int0_737, %int0_738 : !torch.vtensor<[1,?,?],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[?,?],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1413, [%773], affine_map<()[s0] -> (s0 * 32, s0 * 32)> : !torch.vtensor<[?,?],f8E4M3FNUZ> | |
%int0_739 = torch.constant.int 0 | |
%int0_740 = torch.constant.int 0 | |
%int9223372036854775807_741 = torch.constant.int 9223372036854775807 | |
%int1_742 = torch.constant.int 1 | |
%1414 = torch.aten.slice.Tensor %1413, %int0_739, %int0_740, %int9223372036854775807_741, %int1_742 : !torch.vtensor<[?,?],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,?],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1414, [%773], affine_map<()[s0] -> (s0 * 32, s0 * 32)> : !torch.vtensor<[?,?],f8E4M3FNUZ> | |
%int1_743 = torch.constant.int 1 | |
%int0_744 = torch.constant.int 0 | |
%int9223372036854775807_745 = torch.constant.int 9223372036854775807 | |
%int1_746 = torch.constant.int 1 | |
%1415 = torch.aten.slice.Tensor %1414, %int1_743, %int0_744, %int9223372036854775807_745, %int1_746 : !torch.vtensor<[?,?],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,?],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1415, [%773], affine_map<()[s0] -> (s0 * 32, s0 * 32)> : !torch.vtensor<[?,?],f8E4M3FNUZ> | |
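// Masked flash-attention microkernel over the f8 Q/K/V with an f32 softmax scale and the f8 mask; the result comes back as [4,32,?,128] in f32.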
%none_747 = torch.constant.none | |
%1416 = torch.aten.clone %35, %none_747 : !torch.vtensor<[],f32>, !torch.none -> !torch.vtensor<[],f32> | |
%1417 = torch.aten.detach %1416 : !torch.vtensor<[],f32> -> !torch.vtensor<[],f32> | |
%1418 = torch.aten.detach %1417 : !torch.vtensor<[],f32> -> !torch.vtensor<[],f32> | |
%1419 = torch.aten.detach %1418 : !torch.vtensor<[],f32> -> !torch.vtensor<[],f32> | |
%1420 = torch_c.to_builtin_tensor %1408 : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> -> tensor<4x32x?x128xf8E4M3FNUZ> | |
%1421 = torch_c.to_builtin_tensor %1409 : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> -> tensor<4x32x?x128xf8E4M3FNUZ> | |
%1422 = torch_c.to_builtin_tensor %1410 : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> -> tensor<4x32x?x128xf8E4M3FNUZ> | |
%1423 = torch_c.to_builtin_tensor %1415 : !torch.vtensor<[?,?],f8E4M3FNUZ> -> tensor<?x?xf8E4M3FNUZ> | |
%1424 = torch_c.to_builtin_tensor %1419 : !torch.vtensor<[],f32> -> tensor<f32> | |
%1425 = util.call @sharktank_masked_flash_attention_4_32_128_128_f8E4M3FNUZ_f8E4M3FNUZ_f32_f32(%1420, %1421, %1422, %1424, %1423) : (tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<f32>, tensor<?x?xf8E4M3FNUZ>) -> tensor<4x32x?x128xf32> | |
%1426 = torch_c.from_builtin_tensor %1425 : tensor<4x32x?x128xf32> -> !torch.vtensor<[4,32,?,128],f32> | |
torch.bind_symbolic_shape %1426, [%773], affine_map<()[s0] -> (4, 32, s0 * 32, 128)> : !torch.vtensor<[4,32,?,128],f32> | |
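    // Transpose heads back ([4,32,seq,128] -> [4,seq,32,128]) and flatten to [4,seq,4096].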
%int1_748 = torch.constant.int 1 | |
%int2_749 = torch.constant.int 2 | |
%1427 = torch.aten.transpose.int %1426, %int1_748, %int2_749 : !torch.vtensor<[4,32,?,128],f32>, !torch.int, !torch.int -> !torch.vtensor<[4,?,32,128],f32> | |
torch.bind_symbolic_shape %1427, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 128)> : !torch.vtensor<[4,?,32,128],f32> | |
%int0_750 = torch.constant.int 0 | |
%1428 = torch.aten.clone %1427, %int0_750 : !torch.vtensor<[4,?,32,128],f32>, !torch.int -> !torch.vtensor<[4,?,32,128],f32> | |
torch.bind_symbolic_shape %1428, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 128)> : !torch.vtensor<[4,?,32,128],f32> | |
%int4_751 = torch.constant.int 4 | |
%int4096_752 = torch.constant.int 4096 | |
%1429 = torch.prim.ListConstruct %int4_751, %777, %int4096_752 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1430 = torch.aten._unsafe_view %1428, %1429 : !torch.vtensor<[4,?,32,128],f32>, !torch.list<int> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1430, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
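    // Quantize for the attention output projection: divide by the input scale, clamp to the f8E4M3FNUZ range (+/-240), cast to f8.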
%1431 = torch.aten.div.Tensor %1430, %36 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1431, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%float-2.400000e02_753 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_754 = torch.constant.float 2.400000e+02 | |
%1432 = torch.aten.clamp %1431, %float-2.400000e02_753, %float2.400000e02_754 : !torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1432, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int26_755 = torch.constant.int 26 | |
%1433 = torch.prims.convert_element_type %1432, %int26_755 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1433, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
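    // Batched matmul (transpose_b) against the broadcast 4096x4096 output-projection weight, accumulating in f32.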
%int0_756 = torch.constant.int 0 | |
%1434 = torch.aten.unsqueeze %37, %int0_756 : !torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ> | |
%int4_757 = torch.constant.int 4 | |
%int4096_758 = torch.constant.int 4096 | |
%int4096_759 = torch.constant.int 4096 | |
%1435 = torch.prim.ListConstruct %int4_757, %int4096_758, %int4096_759 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_760 = torch.constant.bool false | |
%1436 = torch.aten.expand %1434, %1435, %false_760 : !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ> | |
%1437 = torch_c.to_builtin_tensor %1433 : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> -> tensor<4x?x4096xf8E4M3FNUZ> | |
%1438 = torch_c.to_builtin_tensor %1436 : !torch.vtensor<[4,4096,4096],f8E4M3FNUZ> -> tensor<4x4096x4096xf8E4M3FNUZ> | |
%1439 = util.call @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ_f32(%1437, %1438) : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x?x4096xf32> | |
%1440 = torch_c.from_builtin_tensor %1439 : tensor<4x?x4096xf32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1440, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
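    // Rescale the f32 accumulator by the combined input/weight scales (dequantization of the f8 matmul result).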
%1441 = torch.prim.ListConstruct : () -> !torch.list<int> | |
%1442 = torch.aten.permute %38, %1441 : !torch.vtensor<[],f32>, !torch.list<int> -> !torch.vtensor<[],f32> | |
%1443 = torch.aten.mul.Tensor %36, %1442 : !torch.vtensor<[],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[],f32> | |
%int6_761 = torch.constant.int 6 | |
%1444 = torch.prims.convert_element_type %1440, %int6_761 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1444, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%1445 = torch.aten.mul.Tensor %1444, %1443 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1445, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
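    // Residual connection: add the projected attention output onto the incoming hidden state.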
%int1_762 = torch.constant.int 1 | |
%1446 = torch.aten.add.Tensor %1156, %1445, %int1_762 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1446, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
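    // RMSNorm before the FFN: mean of squares over the last dim, rsqrt with eps 1e-5, scaled by the bf16 norm weight.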
%int6_763 = torch.constant.int 6 | |
%1447 = torch.prims.convert_element_type %1446, %int6_763 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1447, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int2_764 = torch.constant.int 2 | |
%1448 = torch.aten.pow.Tensor_Scalar %1447, %int2_764 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1448, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int-1_765 = torch.constant.int -1 | |
%1449 = torch.prim.ListConstruct %int-1_765 : (!torch.int) -> !torch.list<int> | |
%true_766 = torch.constant.bool true | |
%none_767 = torch.constant.none | |
%1450 = torch.aten.mean.dim %1448, %1449, %true_766, %none_767 : !torch.vtensor<[4,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[4,?,1],f32> | |
torch.bind_symbolic_shape %1450, [%773], affine_map<()[s0] -> (4, s0 * 32, 1)> : !torch.vtensor<[4,?,1],f32> | |
%float1.000000e-05_768 = torch.constant.float 1.000000e-05 | |
%int1_769 = torch.constant.int 1 | |
%1451 = torch.aten.add.Scalar %1450, %float1.000000e-05_768, %int1_769 : !torch.vtensor<[4,?,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[4,?,1],f32> | |
torch.bind_symbolic_shape %1451, [%773], affine_map<()[s0] -> (4, s0 * 32, 1)> : !torch.vtensor<[4,?,1],f32> | |
%1452 = torch.aten.rsqrt %1451 : !torch.vtensor<[4,?,1],f32> -> !torch.vtensor<[4,?,1],f32> | |
torch.bind_symbolic_shape %1452, [%773], affine_map<()[s0] -> (4, s0 * 32, 1)> : !torch.vtensor<[4,?,1],f32> | |
%1453 = torch.aten.mul.Tensor %1447, %1452 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[4,?,1],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1453, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int6_770 = torch.constant.int 6 | |
%1454 = torch.prims.convert_element_type %1453, %int6_770 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1454, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%1455 = torch.aten.mul.Tensor %39, %1454 : !torch.vtensor<[4096],bf16>, !torch.vtensor<[4,?,4096],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1455, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
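    // FFN gate path: quantize to f8, project 4096 -> 14336, rescale, and apply SiLU.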
%int6_771 = torch.constant.int 6 | |
%1456 = torch.prims.convert_element_type %1455, %int6_771 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1456, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%1457 = torch.aten.div.Tensor %1456, %40 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1457, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%float-2.400000e02_772 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_773 = torch.constant.float 2.400000e+02 | |
%1458 = torch.aten.clamp %1457, %float-2.400000e02_772, %float2.400000e02_773 : !torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1458, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int26_774 = torch.constant.int 26 | |
%1459 = torch.prims.convert_element_type %1458, %int26_774 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1459, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
%int0_775 = torch.constant.int 0 | |
%1460 = torch.aten.unsqueeze %41, %int0_775 : !torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ> | |
%int4_776 = torch.constant.int 4 | |
%int14336_777 = torch.constant.int 14336 | |
%int4096_778 = torch.constant.int 4096 | |
%1461 = torch.prim.ListConstruct %int4_776, %int14336_777, %int4096_778 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_779 = torch.constant.bool false | |
%1462 = torch.aten.expand %1460, %1461, %false_779 : !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ> | |
%1463 = torch_c.to_builtin_tensor %1459 : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> -> tensor<4x?x4096xf8E4M3FNUZ> | |
%1464 = torch_c.to_builtin_tensor %1462 : !torch.vtensor<[4,14336,4096],f8E4M3FNUZ> -> tensor<4x14336x4096xf8E4M3FNUZ> | |
%1465 = util.call @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ_f32(%1463, %1464) : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x?x14336xf32> | |
%1466 = torch_c.from_builtin_tensor %1465 : tensor<4x?x14336xf32> -> !torch.vtensor<[4,?,14336],f32> | |
torch.bind_symbolic_shape %1466, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f32> | |
%1467 = torch.prim.ListConstruct : () -> !torch.list<int> | |
%1468 = torch.aten.permute %42, %1467 : !torch.vtensor<[],f32>, !torch.list<int> -> !torch.vtensor<[],f32> | |
%1469 = torch.aten.mul.Tensor %40, %1468 : !torch.vtensor<[],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[],f32> | |
%int6_780 = torch.constant.int 6 | |
%1470 = torch.prims.convert_element_type %1466, %int6_780 : !torch.vtensor<[4,?,14336],f32>, !torch.int -> !torch.vtensor<[4,?,14336],f32> | |
torch.bind_symbolic_shape %1470, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f32> | |
%1471 = torch.aten.mul.Tensor %1470, %1469 : !torch.vtensor<[4,?,14336],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,14336],f32> | |
torch.bind_symbolic_shape %1471, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f32> | |
%1472 = torch.aten.silu %1471 : !torch.vtensor<[4,?,14336],f32> -> !torch.vtensor<[4,?,14336],f32> | |
torch.bind_symbolic_shape %1472, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f32> | |
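    // FFN up path: the same quantize/project/rescale to 14336, then elementwise multiply with the SiLU-gated activations (SwiGLU).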
%1473 = torch.aten.div.Tensor %1456, %43 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1473, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%float-2.400000e02_781 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_782 = torch.constant.float 2.400000e+02 | |
%1474 = torch.aten.clamp %1473, %float-2.400000e02_781, %float2.400000e02_782 : !torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1474, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int26_783 = torch.constant.int 26 | |
%1475 = torch.prims.convert_element_type %1474, %int26_783 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1475, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
%int0_784 = torch.constant.int 0 | |
%1476 = torch.aten.unsqueeze %44, %int0_784 : !torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ> | |
%int4_785 = torch.constant.int 4 | |
%int14336_786 = torch.constant.int 14336 | |
%int4096_787 = torch.constant.int 4096 | |
%1477 = torch.prim.ListConstruct %int4_785, %int14336_786, %int4096_787 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_788 = torch.constant.bool false | |
%1478 = torch.aten.expand %1476, %1477, %false_788 : !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ> | |
%1479 = torch_c.to_builtin_tensor %1475 : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> -> tensor<4x?x4096xf8E4M3FNUZ> | |
%1480 = torch_c.to_builtin_tensor %1478 : !torch.vtensor<[4,14336,4096],f8E4M3FNUZ> -> tensor<4x14336x4096xf8E4M3FNUZ> | |
%1481 = util.call @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ_f32(%1479, %1480) : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x?x14336xf32> | |
%1482 = torch_c.from_builtin_tensor %1481 : tensor<4x?x14336xf32> -> !torch.vtensor<[4,?,14336],f32> | |
torch.bind_symbolic_shape %1482, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f32> | |
%1483 = torch.prim.ListConstruct : () -> !torch.list<int> | |
%1484 = torch.aten.permute %45, %1483 : !torch.vtensor<[],f32>, !torch.list<int> -> !torch.vtensor<[],f32> | |
%1485 = torch.aten.mul.Tensor %43, %1484 : !torch.vtensor<[],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[],f32> | |
%int6_789 = torch.constant.int 6 | |
%1486 = torch.prims.convert_element_type %1482, %int6_789 : !torch.vtensor<[4,?,14336],f32>, !torch.int -> !torch.vtensor<[4,?,14336],f32> | |
torch.bind_symbolic_shape %1486, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f32> | |
%1487 = torch.aten.mul.Tensor %1486, %1485 : !torch.vtensor<[4,?,14336],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,14336],f32> | |
torch.bind_symbolic_shape %1487, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f32> | |
%1488 = torch.aten.mul.Tensor %1472, %1487 : !torch.vtensor<[4,?,14336],f32>, !torch.vtensor<[4,?,14336],f32> -> !torch.vtensor<[4,?,14336],f32> | |
torch.bind_symbolic_shape %1488, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f32> | |
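    // FFN down projection: quantize, project 14336 -> 4096, rescale.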
%1489 = torch.aten.div.Tensor %1488, %46 : !torch.vtensor<[4,?,14336],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,14336],f32> | |
torch.bind_symbolic_shape %1489, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f32> | |
%float-2.400000e02_790 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_791 = torch.constant.float 2.400000e+02 | |
%1490 = torch.aten.clamp %1489, %float-2.400000e02_790, %float2.400000e02_791 : !torch.vtensor<[4,?,14336],f32>, !torch.float, !torch.float -> !torch.vtensor<[4,?,14336],f32> | |
torch.bind_symbolic_shape %1490, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f32> | |
%int26_792 = torch.constant.int 26 | |
%1491 = torch.prims.convert_element_type %1490, %int26_792 : !torch.vtensor<[4,?,14336],f32>, !torch.int -> !torch.vtensor<[4,?,14336],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1491, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f8E4M3FNUZ> | |
%int0_793 = torch.constant.int 0 | |
%1492 = torch.aten.unsqueeze %47, %int0_793 : !torch.vtensor<[4096,14336],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,4096,14336],f8E4M3FNUZ> | |
%int4_794 = torch.constant.int 4 | |
%int4096_795 = torch.constant.int 4096 | |
%int14336_796 = torch.constant.int 14336 | |
%1493 = torch.prim.ListConstruct %int4_794, %int4096_795, %int14336_796 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_797 = torch.constant.bool false | |
%1494 = torch.aten.expand %1492, %1493, %false_797 : !torch.vtensor<[1,4096,14336],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[4,4096,14336],f8E4M3FNUZ> | |
%1495 = torch_c.to_builtin_tensor %1491 : !torch.vtensor<[4,?,14336],f8E4M3FNUZ> -> tensor<4x?x14336xf8E4M3FNUZ> | |
%1496 = torch_c.to_builtin_tensor %1494 : !torch.vtensor<[4,4096,14336],f8E4M3FNUZ> -> tensor<4x4096x14336xf8E4M3FNUZ> | |
%1497 = util.call @sharktank_batch_matmul_transpose_b_L4xDx14336xf8E4M3FNUZ_R4x4096x14336xf8E4M3FNUZ_f32(%1495, %1496) : (tensor<4x?x14336xf8E4M3FNUZ>, tensor<4x4096x14336xf8E4M3FNUZ>) -> tensor<4x?x4096xf32> | |
%1498 = torch_c.from_builtin_tensor %1497 : tensor<4x?x4096xf32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1498, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%1499 = torch.prim.ListConstruct : () -> !torch.list<int> | |
%1500 = torch.aten.permute %48, %1499 : !torch.vtensor<[],f32>, !torch.list<int> -> !torch.vtensor<[],f32> | |
%1501 = torch.aten.mul.Tensor %46, %1500 : !torch.vtensor<[],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[],f32> | |
%int6_798 = torch.constant.int 6 | |
%1502 = torch.prims.convert_element_type %1498, %int6_798 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1502, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%1503 = torch.aten.mul.Tensor %1502, %1501 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1503, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
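    // Residual add closing out this transformer block.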
%int1_799 = torch.constant.int 1 | |
%1504 = torch.aten.add.Tensor %1446, %1503, %int1_799 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1504, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
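    // RMSNorm of the updated hidden state, feeding the next attention block's projections.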
%int6_800 = torch.constant.int 6 | |
%1505 = torch.prims.convert_element_type %1504, %int6_800 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1505, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int2_801 = torch.constant.int 2 | |
%1506 = torch.aten.pow.Tensor_Scalar %1505, %int2_801 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1506, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int-1_802 = torch.constant.int -1 | |
%1507 = torch.prim.ListConstruct %int-1_802 : (!torch.int) -> !torch.list<int> | |
%true_803 = torch.constant.bool true | |
%none_804 = torch.constant.none | |
%1508 = torch.aten.mean.dim %1506, %1507, %true_803, %none_804 : !torch.vtensor<[4,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[4,?,1],f32> | |
torch.bind_symbolic_shape %1508, [%773], affine_map<()[s0] -> (4, s0 * 32, 1)> : !torch.vtensor<[4,?,1],f32> | |
%float1.000000e-05_805 = torch.constant.float 1.000000e-05 | |
%int1_806 = torch.constant.int 1 | |
%1509 = torch.aten.add.Scalar %1508, %float1.000000e-05_805, %int1_806 : !torch.vtensor<[4,?,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[4,?,1],f32> | |
torch.bind_symbolic_shape %1509, [%773], affine_map<()[s0] -> (4, s0 * 32, 1)> : !torch.vtensor<[4,?,1],f32> | |
%1510 = torch.aten.rsqrt %1509 : !torch.vtensor<[4,?,1],f32> -> !torch.vtensor<[4,?,1],f32> | |
torch.bind_symbolic_shape %1510, [%773], affine_map<()[s0] -> (4, s0 * 32, 1)> : !torch.vtensor<[4,?,1],f32> | |
%1511 = torch.aten.mul.Tensor %1505, %1510 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[4,?,1],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1511, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int6_807 = torch.constant.int 6 | |
%1512 = torch.prims.convert_element_type %1511, %int6_807 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1512, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%1513 = torch.aten.mul.Tensor %49, %1512 : !torch.vtensor<[4096],bf16>, !torch.vtensor<[4,?,4096],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1513, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int6_808 = torch.constant.int 6 | |
%1514 = torch.prims.convert_element_type %1513, %int6_808 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1514, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
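    // Q/K/V projections for the next attention block: quantize to f8, then project 4096 -> 4096 (Q) and 4096 -> 1024 (K and V).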
%1515 = torch.aten.div.Tensor %1514, %50 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1515, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%float-2.400000e02_809 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_810 = torch.constant.float 2.400000e+02 | |
%1516 = torch.aten.clamp %1515, %float-2.400000e02_809, %float2.400000e02_810 : !torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1516, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int26_811 = torch.constant.int 26 | |
%1517 = torch.prims.convert_element_type %1516, %int26_811 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1517, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
%int0_812 = torch.constant.int 0 | |
%1518 = torch.aten.unsqueeze %51, %int0_812 : !torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ> | |
%int4_813 = torch.constant.int 4 | |
%int4096_814 = torch.constant.int 4096 | |
%int4096_815 = torch.constant.int 4096 | |
%1519 = torch.prim.ListConstruct %int4_813, %int4096_814, %int4096_815 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_816 = torch.constant.bool false | |
%1520 = torch.aten.expand %1518, %1519, %false_816 : !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ> | |
%1521 = torch_c.to_builtin_tensor %1517 : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> -> tensor<4x?x4096xf8E4M3FNUZ> | |
%1522 = torch_c.to_builtin_tensor %1520 : !torch.vtensor<[4,4096,4096],f8E4M3FNUZ> -> tensor<4x4096x4096xf8E4M3FNUZ> | |
%1523 = util.call @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ_f32(%1521, %1522) : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x?x4096xf32> | |
%1524 = torch_c.from_builtin_tensor %1523 : tensor<4x?x4096xf32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1524, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%1525 = torch.aten.div.Tensor %1524, %52 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1525, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%float-2.400000e02_817 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_818 = torch.constant.float 2.400000e+02 | |
%1526 = torch.aten.clamp %1525, %float-2.400000e02_817, %float2.400000e02_818 : !torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1526, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int26_819 = torch.constant.int 26 | |
%1527 = torch.prims.convert_element_type %1526, %int26_819 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1527, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
%1528 = torch.aten.div.Tensor %1514, %53 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1528, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%float-2.400000e02_820 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_821 = torch.constant.float 2.400000e+02 | |
%1529 = torch.aten.clamp %1528, %float-2.400000e02_820, %float2.400000e02_821 : !torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1529, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int26_822 = torch.constant.int 26 | |
%1530 = torch.prims.convert_element_type %1529, %int26_822 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1530, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
%int0_823 = torch.constant.int 0 | |
%1531 = torch.aten.unsqueeze %54, %int0_823 : !torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ> | |
%int4_824 = torch.constant.int 4 | |
%int1024_825 = torch.constant.int 1024 | |
%int4096_826 = torch.constant.int 4096 | |
%1532 = torch.prim.ListConstruct %int4_824, %int1024_825, %int4096_826 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_827 = torch.constant.bool false | |
%1533 = torch.aten.expand %1531, %1532, %false_827 : !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ> | |
%1534 = torch_c.to_builtin_tensor %1530 : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> -> tensor<4x?x4096xf8E4M3FNUZ> | |
%1535 = torch_c.to_builtin_tensor %1533 : !torch.vtensor<[4,1024,4096],f8E4M3FNUZ> -> tensor<4x1024x4096xf8E4M3FNUZ> | |
%1536 = util.call @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ_f32(%1534, %1535) : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x?x1024xf32> | |
%1537 = torch_c.from_builtin_tensor %1536 : tensor<4x?x1024xf32> -> !torch.vtensor<[4,?,1024],f32> | |
torch.bind_symbolic_shape %1537, [%773], affine_map<()[s0] -> (4, s0 * 32, 1024)> : !torch.vtensor<[4,?,1024],f32> | |
%1538 = torch.aten.div.Tensor %1537, %55 : !torch.vtensor<[4,?,1024],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,1024],f32> | |
torch.bind_symbolic_shape %1538, [%773], affine_map<()[s0] -> (4, s0 * 32, 1024)> : !torch.vtensor<[4,?,1024],f32> | |
%float-2.400000e02_828 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_829 = torch.constant.float 2.400000e+02 | |
%1539 = torch.aten.clamp %1538, %float-2.400000e02_828, %float2.400000e02_829 : !torch.vtensor<[4,?,1024],f32>, !torch.float, !torch.float -> !torch.vtensor<[4,?,1024],f32> | |
torch.bind_symbolic_shape %1539, [%773], affine_map<()[s0] -> (4, s0 * 32, 1024)> : !torch.vtensor<[4,?,1024],f32> | |
%int26_830 = torch.constant.int 26 | |
%1540 = torch.prims.convert_element_type %1539, %int26_830 : !torch.vtensor<[4,?,1024],f32>, !torch.int -> !torch.vtensor<[4,?,1024],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1540, [%773], affine_map<()[s0] -> (4, s0 * 32, 1024)> : !torch.vtensor<[4,?,1024],f8E4M3FNUZ> | |
%1541 = torch.aten.div.Tensor %1514, %56 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1541, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%float-2.400000e02_831 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_832 = torch.constant.float 2.400000e+02 | |
%1542 = torch.aten.clamp %1541, %float-2.400000e02_831, %float2.400000e02_832 : !torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1542, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int26_833 = torch.constant.int 26 | |
%1543 = torch.prims.convert_element_type %1542, %int26_833 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1543, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
%int0_834 = torch.constant.int 0 | |
%1544 = torch.aten.unsqueeze %57, %int0_834 : !torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ> | |
%int4_835 = torch.constant.int 4 | |
%int1024_836 = torch.constant.int 1024 | |
%int4096_837 = torch.constant.int 4096 | |
%1545 = torch.prim.ListConstruct %int4_835, %int1024_836, %int4096_837 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_838 = torch.constant.bool false | |
%1546 = torch.aten.expand %1544, %1545, %false_838 : !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ> | |
%1547 = torch_c.to_builtin_tensor %1543 : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> -> tensor<4x?x4096xf8E4M3FNUZ> | |
%1548 = torch_c.to_builtin_tensor %1546 : !torch.vtensor<[4,1024,4096],f8E4M3FNUZ> -> tensor<4x1024x4096xf8E4M3FNUZ> | |
%1549 = util.call @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ_f32(%1547, %1548) : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x?x1024xf32> | |
%1550 = torch_c.from_builtin_tensor %1549 : tensor<4x?x1024xf32> -> !torch.vtensor<[4,?,1024],f32> | |
torch.bind_symbolic_shape %1550, [%773], affine_map<()[s0] -> (4, s0 * 32, 1024)> : !torch.vtensor<[4,?,1024],f32> | |
%1551 = torch.aten.div.Tensor %1550, %58 : !torch.vtensor<[4,?,1024],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,1024],f32> | |
torch.bind_symbolic_shape %1551, [%773], affine_map<()[s0] -> (4, s0 * 32, 1024)> : !torch.vtensor<[4,?,1024],f32> | |
%float-2.400000e02_839 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_840 = torch.constant.float 2.400000e+02 | |
%1552 = torch.aten.clamp %1551, %float-2.400000e02_839, %float2.400000e02_840 : !torch.vtensor<[4,?,1024],f32>, !torch.float, !torch.float -> !torch.vtensor<[4,?,1024],f32> | |
torch.bind_symbolic_shape %1552, [%773], affine_map<()[s0] -> (4, s0 * 32, 1024)> : !torch.vtensor<[4,?,1024],f32> | |
%int26_841 = torch.constant.int 26 | |
%1553 = torch.prims.convert_element_type %1552, %int26_841 : !torch.vtensor<[4,?,1024],f32>, !torch.int -> !torch.vtensor<[4,?,1024],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1553, [%773], affine_map<()[s0] -> (4, s0 * 32, 1024)> : !torch.vtensor<[4,?,1024],f8E4M3FNUZ> | |
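    // Reshape into per-head layout: 32 query heads and 8 KV heads of size 128 (grouped-query attention).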
%int4_842 = torch.constant.int 4 | |
%int32_843 = torch.constant.int 32 | |
%int128_844 = torch.constant.int 128 | |
%1554 = torch.prim.ListConstruct %int4_842, %777, %int32_843, %int128_844 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1555 = torch.aten.view %1527, %1554 : !torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1555, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 128)> : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
%int4_845 = torch.constant.int 4 | |
%int8_846 = torch.constant.int 8 | |
%int128_847 = torch.constant.int 128 | |
%1556 = torch.prim.ListConstruct %int4_845, %777, %int8_846, %int128_847 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1557 = torch.aten.view %1540, %1556 : !torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1557, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 128)> : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
%int4_848 = torch.constant.int 4 | |
%int8_849 = torch.constant.int 8 | |
%int128_850 = torch.constant.int 128 | |
%1558 = torch.prim.ListConstruct %int4_848, %777, %int8_849, %int128_850 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1559 = torch.aten.view %1553, %1558 : !torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1559, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 128)> : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
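    // Build the RoPE tables: inverse frequencies with base 5.0e5 and what appears to be Llama-3-style frequency scaling around an original 8192 context, then position-by-frequency cos/sin caches in bf16.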
%int131072_851 = torch.constant.int 131072 | |
%none_852 = torch.constant.none | |
%none_853 = torch.constant.none | |
%cpu_854 = torch.constant.device "cpu" | |
%false_855 = torch.constant.bool false | |
%1560 = torch.aten.arange %int131072_851, %none_852, %none_853, %cpu_854, %false_855 : !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[131072],si64> | |
%int0_856 = torch.constant.int 0 | |
%int128_857 = torch.constant.int 128 | |
%int2_858 = torch.constant.int 2 | |
%int4_859 = torch.constant.int 4 | |
%none_860 = torch.constant.none | |
%cpu_861 = torch.constant.device "cpu" | |
%false_862 = torch.constant.bool false | |
%1561 = torch.aten.arange.start_step %int0_856, %int128_857, %int2_858, %int4_859, %none_860, %cpu_861, %false_862 : !torch.int, !torch.int, !torch.int, !torch.int, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[64],si64> | |
%int6_863 = torch.constant.int 6 | |
%1562 = torch.prims.convert_element_type %1561, %int6_863 : !torch.vtensor<[64],si64>, !torch.int -> !torch.vtensor<[64],f32> | |
%int128_864 = torch.constant.int 128 | |
%1563 = torch.aten.div.Scalar %1562, %int128_864 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%float5.000000e05_865 = torch.constant.float 5.000000e+05 | |
%1564 = torch.aten.pow.Scalar %float5.000000e05_865, %1563 : !torch.float, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%1565 = torch.aten.reciprocal %1564 : !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%float1.000000e00_866 = torch.constant.float 1.000000e+00 | |
%1566 = torch.aten.mul.Scalar %1565, %float1.000000e00_866 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],f32> | |
%1567 = torch.aten.reciprocal %1566 : !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%float6.283190e00_867 = torch.constant.float 6.2831853071795862 | |
%1568 = torch.aten.mul.Scalar %1567, %float6.283190e00_867 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],f32> | |
%float8.192000e03_868 = torch.constant.float 8.192000e+03 | |
%1569 = torch.aten.gt.Scalar %1568, %float8.192000e03_868 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],i1> | |
%int8_869 = torch.constant.int 8 | |
%1570 = torch.aten.div.Scalar %1566, %int8_869 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%1571 = torch.aten.where.self %1569, %1570, %1566 : !torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%1572 = torch.aten.reciprocal %1568 : !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%int8192_870 = torch.constant.int 8192 | |
%1573 = torch.aten.mul.Scalar %1572, %int8192_870 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%int1_871 = torch.constant.int 1 | |
%int1_872 = torch.constant.int 1 | |
%1574 = torch.aten.sub.Scalar %1573, %int1_871, %int1_872 : !torch.vtensor<[64],f32>, !torch.int, !torch.int -> !torch.vtensor<[64],f32> | |
%int3_873 = torch.constant.int 3 | |
%1575 = torch.aten.div.Scalar %1574, %int3_873 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%int1_874 = torch.constant.int 1 | |
%int1_875 = torch.constant.int 1 | |
%1576 = torch.aten.rsub.Scalar %1575, %int1_874, %int1_875 : !torch.vtensor<[64],f32>, !torch.int, !torch.int -> !torch.vtensor<[64],f32> | |
%1577 = torch.aten.mul.Tensor %1576, %1571 : !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%int8_876 = torch.constant.int 8 | |
%1578 = torch.aten.div.Scalar %1577, %int8_876 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%1579 = torch.aten.mul.Tensor %1575, %1571 : !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%int1_877 = torch.constant.int 1 | |
%1580 = torch.aten.add.Tensor %1578, %1579, %int1_877 : !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%float2.048000e03_878 = torch.constant.float 2.048000e+03 | |
%1581 = torch.aten.lt.Scalar %1568, %float2.048000e03_878 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],i1> | |
%1582 = torch.aten.bitwise_not %1581 : !torch.vtensor<[64],i1> -> !torch.vtensor<[64],i1> | |
%float8.192000e03_879 = torch.constant.float 8.192000e+03 | |
%1583 = torch.aten.gt.Scalar %1568, %float8.192000e03_879 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],i1> | |
%1584 = torch.aten.bitwise_not %1583 : !torch.vtensor<[64],i1> -> !torch.vtensor<[64],i1> | |
%1585 = torch.aten.mul.Tensor %1582, %1584 : !torch.vtensor<[64],i1>, !torch.vtensor<[64],i1> -> !torch.vtensor<[64],i1> | |
%1586 = torch.aten.where.self %1585, %1580, %1571 : !torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%1587 = torch.prim.ListConstruct %1586, %1586 : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.list<vtensor> | |
%int-1_880 = torch.constant.int -1 | |
%1588 = torch.aten.cat %1587, %int-1_880 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[128],f32> | |
%int6_881 = torch.constant.int 6 | |
%1589 = torch.prims.convert_element_type %1588, %int6_881 : !torch.vtensor<[128],f32>, !torch.int -> !torch.vtensor<[128],f32> | |
%int1_882 = torch.constant.int 1 | |
%1590 = torch.aten.unsqueeze %1560, %int1_882 : !torch.vtensor<[131072],si64>, !torch.int -> !torch.vtensor<[131072,1],si64> | |
%int6_883 = torch.constant.int 6 | |
%1591 = torch.prims.convert_element_type %1590, %int6_883 : !torch.vtensor<[131072,1],si64>, !torch.int -> !torch.vtensor<[131072,1],f32> | |
%int0_884 = torch.constant.int 0 | |
%1592 = torch.aten.unsqueeze %1589, %int0_884 : !torch.vtensor<[128],f32>, !torch.int -> !torch.vtensor<[1,128],f32> | |
%int6_885 = torch.constant.int 6 | |
%1593 = torch.prims.convert_element_type %1592, %int6_885 : !torch.vtensor<[1,128],f32>, !torch.int -> !torch.vtensor<[1,128],f32> | |
%1594 = torch.aten.mul.Tensor %1591, %1593 : !torch.vtensor<[131072,1],f32>, !torch.vtensor<[1,128],f32> -> !torch.vtensor<[131072,128],f32> | |
%1595 = torch.aten.cos %1594 : !torch.vtensor<[131072,128],f32> -> !torch.vtensor<[131072,128],f32> | |
%int15_886 = torch.constant.int 15 | |
%1596 = torch.prims.convert_element_type %1595, %int15_886 : !torch.vtensor<[131072,128],f32>, !torch.int -> !torch.vtensor<[131072,128],bf16> | |
%1597 = torch.aten.sin %1594 : !torch.vtensor<[131072,128],f32> -> !torch.vtensor<[131072,128],f32> | |
%int15_887 = torch.constant.int 15 | |
%1598 = torch.prims.convert_element_type %1597, %int15_887 : !torch.vtensor<[131072,128],f32>, !torch.int -> !torch.vtensor<[131072,128],bf16> | |
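    // Slice the cos/sin caches to the current sequence length and broadcast them to [4, seq, 1, 128].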
%int0_888 = torch.constant.int 0 | |
%int0_889 = torch.constant.int 0 | |
%int1_890 = torch.constant.int 1 | |
%1599 = torch.aten.slice.Tensor %1596, %int0_888, %int0_889, %777, %int1_890 : !torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %1599, [%773], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int1_891 = torch.constant.int 1 | |
%int0_892 = torch.constant.int 0 | |
%int9223372036854775807_893 = torch.constant.int 9223372036854775807 | |
%int1_894 = torch.constant.int 1 | |
%1600 = torch.aten.slice.Tensor %1599, %int1_891, %int0_892, %int9223372036854775807_893, %int1_894 : !torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %1600, [%773], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int0_895 = torch.constant.int 0 | |
%int0_896 = torch.constant.int 0 | |
%int1_897 = torch.constant.int 1 | |
%1601 = torch.aten.slice.Tensor %1598, %int0_895, %int0_896, %777, %int1_897 : !torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %1601, [%773], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int1_898 = torch.constant.int 1 | |
%int0_899 = torch.constant.int 0 | |
%int9223372036854775807_900 = torch.constant.int 9223372036854775807 | |
%int1_901 = torch.constant.int 1 | |
%1602 = torch.aten.slice.Tensor %1601, %int1_898, %int0_899, %int9223372036854775807_900, %int1_901 : !torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %1602, [%773], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int0_902 = torch.constant.int 0 | |
%1603 = torch.aten.unsqueeze %1600, %int0_902 : !torch.vtensor<[?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %1603, [%773], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int1_903 = torch.constant.int 1 | |
%int0_904 = torch.constant.int 0 | |
%int9223372036854775807_905 = torch.constant.int 9223372036854775807 | |
%int1_906 = torch.constant.int 1 | |
%1604 = torch.aten.slice.Tensor %1603, %int1_903, %int0_904, %int9223372036854775807_905, %int1_906 : !torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %1604, [%773], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int2_907 = torch.constant.int 2 | |
%1605 = torch.aten.unsqueeze %1604, %int2_907 : !torch.vtensor<[1,?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %1605, [%773], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int3_908 = torch.constant.int 3 | |
%int0_909 = torch.constant.int 0 | |
%int9223372036854775807_910 = torch.constant.int 9223372036854775807 | |
%int1_911 = torch.constant.int 1 | |
%1606 = torch.aten.slice.Tensor %1605, %int3_908, %int0_909, %int9223372036854775807_910, %int1_911 : !torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %1606, [%773], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int4_912 = torch.constant.int 4 | |
%int1_913 = torch.constant.int 1 | |
%int1_914 = torch.constant.int 1 | |
%int1_915 = torch.constant.int 1 | |
%1607 = torch.prim.ListConstruct %int4_912, %int1_913, %int1_914, %int1_915 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1608 = torch.aten.repeat %1606, %1607 : !torch.vtensor<[1,?,1,128],bf16>, !torch.list<int> -> !torch.vtensor<[4,?,1,128],bf16> | |
torch.bind_symbolic_shape %1608, [%773], affine_map<()[s0] -> (4, s0 * 32, 1, 128)> : !torch.vtensor<[4,?,1,128],bf16> | |
%int0_916 = torch.constant.int 0 | |
%1609 = torch.aten.unsqueeze %1602, %int0_916 : !torch.vtensor<[?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %1609, [%773], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int1_917 = torch.constant.int 1 | |
%int0_918 = torch.constant.int 0 | |
%int9223372036854775807_919 = torch.constant.int 9223372036854775807 | |
%int1_920 = torch.constant.int 1 | |
%1610 = torch.aten.slice.Tensor %1609, %int1_917, %int0_918, %int9223372036854775807_919, %int1_920 : !torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %1610, [%773], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int2_921 = torch.constant.int 2 | |
%1611 = torch.aten.unsqueeze %1610, %int2_921 : !torch.vtensor<[1,?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %1611, [%773], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int3_922 = torch.constant.int 3 | |
%int0_923 = torch.constant.int 0 | |
%int9223372036854775807_924 = torch.constant.int 9223372036854775807 | |
%int1_925 = torch.constant.int 1 | |
%1612 = torch.aten.slice.Tensor %1611, %int3_922, %int0_923, %int9223372036854775807_924, %int1_925 : !torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %1612, [%773], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int4_926 = torch.constant.int 4 | |
%int1_927 = torch.constant.int 1 | |
%int1_928 = torch.constant.int 1 | |
%int1_929 = torch.constant.int 1 | |
%1613 = torch.prim.ListConstruct %int4_926, %int1_927, %int1_928, %int1_929 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1614 = torch.aten.repeat %1612, %1613 : !torch.vtensor<[1,?,1,128],bf16>, !torch.list<int> -> !torch.vtensor<[4,?,1,128],bf16> | |
torch.bind_symbolic_shape %1614, [%773], affine_map<()[s0] -> (4, s0 * 32, 1, 128)> : !torch.vtensor<[4,?,1,128],bf16> | |
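    // Apply rotary embedding to Q: q*cos + rotate_half(q)*sin over the 128-dim head.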
%1615 = torch.aten.mul.Tensor %1555, %1608 : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16> -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1615, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 128)> : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
%int3_930 = torch.constant.int 3 | |
%int0_931 = torch.constant.int 0 | |
%int64_932 = torch.constant.int 64 | |
%int1_933 = torch.constant.int 1 | |
%1616 = torch.aten.slice.Tensor %1555, %int3_930, %int0_931, %int64_932, %int1_933 : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1616, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 64)> : !torch.vtensor<[4,?,32,64],f8E4M3FNUZ> | |
%int3_934 = torch.constant.int 3 | |
%int64_935 = torch.constant.int 64 | |
%int9223372036854775807_936 = torch.constant.int 9223372036854775807 | |
%int1_937 = torch.constant.int 1 | |
%1617 = torch.aten.slice.Tensor %1555, %int3_934, %int64_935, %int9223372036854775807_936, %int1_937 : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1617, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 64)> : !torch.vtensor<[4,?,32,64],f8E4M3FNUZ> | |
%1618 = torch.aten.neg %1617 : !torch.vtensor<[4,?,32,64],f8E4M3FNUZ> -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1618, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 64)> : !torch.vtensor<[4,?,32,64],f8E4M3FNUZ> | |
%1619 = torch.prim.ListConstruct %1618, %1616 : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>) -> !torch.list<vtensor> | |
%int-1_938 = torch.constant.int -1 | |
%1620 = torch.aten.cat %1619, %int-1_938 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1620, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 128)> : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
%1621 = torch.aten.mul.Tensor %1620, %1614 : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16> -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1621, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 128)> : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
%int1_939 = torch.constant.int 1 | |
%1622 = torch.aten.add.Tensor %1615, %1621, %int1_939 : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1622, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 128)> : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
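    // The same RoPE table construction is repeated below, presumably for applying rotary embedding to K.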
%int131072_940 = torch.constant.int 131072 | |
%none_941 = torch.constant.none | |
%none_942 = torch.constant.none | |
%cpu_943 = torch.constant.device "cpu" | |
%false_944 = torch.constant.bool false | |
%1623 = torch.aten.arange %int131072_940, %none_941, %none_942, %cpu_943, %false_944 : !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[131072],si64> | |
%int0_945 = torch.constant.int 0 | |
%int128_946 = torch.constant.int 128 | |
%int2_947 = torch.constant.int 2 | |
%int4_948 = torch.constant.int 4 | |
%none_949 = torch.constant.none | |
%cpu_950 = torch.constant.device "cpu" | |
%false_951 = torch.constant.bool false | |
%1624 = torch.aten.arange.start_step %int0_945, %int128_946, %int2_947, %int4_948, %none_949, %cpu_950, %false_951 : !torch.int, !torch.int, !torch.int, !torch.int, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[64],si64> | |
%int6_952 = torch.constant.int 6 | |
%1625 = torch.prims.convert_element_type %1624, %int6_952 : !torch.vtensor<[64],si64>, !torch.int -> !torch.vtensor<[64],f32> | |
%int128_953 = torch.constant.int 128 | |
%1626 = torch.aten.div.Scalar %1625, %int128_953 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%float5.000000e05_954 = torch.constant.float 5.000000e+05 | |
%1627 = torch.aten.pow.Scalar %float5.000000e05_954, %1626 : !torch.float, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%1628 = torch.aten.reciprocal %1627 : !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%float1.000000e00_955 = torch.constant.float 1.000000e+00 | |
%1629 = torch.aten.mul.Scalar %1628, %float1.000000e00_955 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],f32> | |
%1630 = torch.aten.reciprocal %1629 : !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%float6.283190e00_956 = torch.constant.float 6.2831853071795862 | |
%1631 = torch.aten.mul.Scalar %1630, %float6.283190e00_956 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],f32> | |
%float8.192000e03_957 = torch.constant.float 8.192000e+03 | |
%1632 = torch.aten.gt.Scalar %1631, %float8.192000e03_957 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],i1> | |
%int8_958 = torch.constant.int 8 | |
%1633 = torch.aten.div.Scalar %1629, %int8_958 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%1634 = torch.aten.where.self %1632, %1633, %1629 : !torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%1635 = torch.aten.reciprocal %1631 : !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%int8192_959 = torch.constant.int 8192 | |
%1636 = torch.aten.mul.Scalar %1635, %int8192_959 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%int1_960 = torch.constant.int 1 | |
%int1_961 = torch.constant.int 1 | |
%1637 = torch.aten.sub.Scalar %1636, %int1_960, %int1_961 : !torch.vtensor<[64],f32>, !torch.int, !torch.int -> !torch.vtensor<[64],f32> | |
%int3_962 = torch.constant.int 3 | |
%1638 = torch.aten.div.Scalar %1637, %int3_962 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%int1_963 = torch.constant.int 1 | |
%int1_964 = torch.constant.int 1 | |
%1639 = torch.aten.rsub.Scalar %1638, %int1_963, %int1_964 : !torch.vtensor<[64],f32>, !torch.int, !torch.int -> !torch.vtensor<[64],f32> | |
%1640 = torch.aten.mul.Tensor %1639, %1634 : !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%int8_965 = torch.constant.int 8 | |
%1641 = torch.aten.div.Scalar %1640, %int8_965 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%1642 = torch.aten.mul.Tensor %1638, %1634 : !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%int1_966 = torch.constant.int 1 | |
%1643 = torch.aten.add.Tensor %1641, %1642, %int1_966 : !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%float2.048000e03_967 = torch.constant.float 2.048000e+03 | |
%1644 = torch.aten.lt.Scalar %1631, %float2.048000e03_967 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],i1> | |
%1645 = torch.aten.bitwise_not %1644 : !torch.vtensor<[64],i1> -> !torch.vtensor<[64],i1> | |
%float8.192000e03_968 = torch.constant.float 8.192000e+03 | |
%1646 = torch.aten.gt.Scalar %1631, %float8.192000e03_968 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],i1> | |
%1647 = torch.aten.bitwise_not %1646 : !torch.vtensor<[64],i1> -> !torch.vtensor<[64],i1> | |
%1648 = torch.aten.mul.Tensor %1645, %1647 : !torch.vtensor<[64],i1>, !torch.vtensor<[64],i1> -> !torch.vtensor<[64],i1> | |
%1649 = torch.aten.where.self %1648, %1643, %1634 : !torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%1650 = torch.prim.ListConstruct %1649, %1649 : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.list<vtensor> | |
%int-1_969 = torch.constant.int -1 | |
%1651 = torch.aten.cat %1650, %int-1_969 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[128],f32> | |
%int6_970 = torch.constant.int 6 | |
%1652 = torch.prims.convert_element_type %1651, %int6_970 : !torch.vtensor<[128],f32>, !torch.int -> !torch.vtensor<[128],f32> | |
%int1_971 = torch.constant.int 1 | |
%1653 = torch.aten.unsqueeze %1623, %int1_971 : !torch.vtensor<[131072],si64>, !torch.int -> !torch.vtensor<[131072,1],si64> | |
%int6_972 = torch.constant.int 6 | |
%1654 = torch.prims.convert_element_type %1653, %int6_972 : !torch.vtensor<[131072,1],si64>, !torch.int -> !torch.vtensor<[131072,1],f32> | |
%int0_973 = torch.constant.int 0 | |
%1655 = torch.aten.unsqueeze %1652, %int0_973 : !torch.vtensor<[128],f32>, !torch.int -> !torch.vtensor<[1,128],f32> | |
%int6_974 = torch.constant.int 6 | |
%1656 = torch.prims.convert_element_type %1655, %int6_974 : !torch.vtensor<[1,128],f32>, !torch.int -> !torch.vtensor<[1,128],f32> | |
%1657 = torch.aten.mul.Tensor %1654, %1656 : !torch.vtensor<[131072,1],f32>, !torch.vtensor<[1,128],f32> -> !torch.vtensor<[131072,128],f32> | |
%1658 = torch.aten.cos %1657 : !torch.vtensor<[131072,128],f32> -> !torch.vtensor<[131072,128],f32> | |
%int15_975 = torch.constant.int 15 | |
%1659 = torch.prims.convert_element_type %1658, %int15_975 : !torch.vtensor<[131072,128],f32>, !torch.int -> !torch.vtensor<[131072,128],bf16> | |
%1660 = torch.aten.sin %1657 : !torch.vtensor<[131072,128],f32> -> !torch.vtensor<[131072,128],f32> | |
%int15_976 = torch.constant.int 15 | |
%1661 = torch.prims.convert_element_type %1660, %int15_976 : !torch.vtensor<[131072,128],f32>, !torch.int -> !torch.vtensor<[131072,128],bf16> | |
%int0_977 = torch.constant.int 0 | |
%int0_978 = torch.constant.int 0 | |
%int1_979 = torch.constant.int 1 | |
%1662 = torch.aten.slice.Tensor %1659, %int0_977, %int0_978, %777, %int1_979 : !torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %1662, [%773], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int1_980 = torch.constant.int 1 | |
%int0_981 = torch.constant.int 0 | |
%int9223372036854775807_982 = torch.constant.int 9223372036854775807 | |
%int1_983 = torch.constant.int 1 | |
%1663 = torch.aten.slice.Tensor %1662, %int1_980, %int0_981, %int9223372036854775807_982, %int1_983 : !torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %1663, [%773], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int0_984 = torch.constant.int 0 | |
%int0_985 = torch.constant.int 0 | |
%int1_986 = torch.constant.int 1 | |
%1664 = torch.aten.slice.Tensor %1661, %int0_984, %int0_985, %777, %int1_986 : !torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %1664, [%773], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int1_987 = torch.constant.int 1 | |
%int0_988 = torch.constant.int 0 | |
%int9223372036854775807_989 = torch.constant.int 9223372036854775807 | |
%int1_990 = torch.constant.int 1 | |
%1665 = torch.aten.slice.Tensor %1664, %int1_987, %int0_988, %int9223372036854775807_989, %int1_990 : !torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %1665, [%773], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int0_991 = torch.constant.int 0 | |
%1666 = torch.aten.unsqueeze %1663, %int0_991 : !torch.vtensor<[?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %1666, [%773], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int1_992 = torch.constant.int 1 | |
%int0_993 = torch.constant.int 0 | |
%int9223372036854775807_994 = torch.constant.int 9223372036854775807 | |
%int1_995 = torch.constant.int 1 | |
%1667 = torch.aten.slice.Tensor %1666, %int1_992, %int0_993, %int9223372036854775807_994, %int1_995 : !torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %1667, [%773], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int2_996 = torch.constant.int 2 | |
%1668 = torch.aten.unsqueeze %1667, %int2_996 : !torch.vtensor<[1,?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %1668, [%773], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int3_997 = torch.constant.int 3 | |
%int0_998 = torch.constant.int 0 | |
%int9223372036854775807_999 = torch.constant.int 9223372036854775807 | |
%int1_1000 = torch.constant.int 1 | |
%1669 = torch.aten.slice.Tensor %1668, %int3_997, %int0_998, %int9223372036854775807_999, %int1_1000 : !torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %1669, [%773], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int4_1001 = torch.constant.int 4 | |
%int1_1002 = torch.constant.int 1 | |
%int1_1003 = torch.constant.int 1 | |
%int1_1004 = torch.constant.int 1 | |
%1670 = torch.prim.ListConstruct %int4_1001, %int1_1002, %int1_1003, %int1_1004 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1671 = torch.aten.repeat %1669, %1670 : !torch.vtensor<[1,?,1,128],bf16>, !torch.list<int> -> !torch.vtensor<[4,?,1,128],bf16> | |
torch.bind_symbolic_shape %1671, [%773], affine_map<()[s0] -> (4, s0 * 32, 1, 128)> : !torch.vtensor<[4,?,1,128],bf16> | |
%int0_1005 = torch.constant.int 0 | |
%1672 = torch.aten.unsqueeze %1665, %int0_1005 : !torch.vtensor<[?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %1672, [%773], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int1_1006 = torch.constant.int 1 | |
%int0_1007 = torch.constant.int 0 | |
%int9223372036854775807_1008 = torch.constant.int 9223372036854775807 | |
%int1_1009 = torch.constant.int 1 | |
%1673 = torch.aten.slice.Tensor %1672, %int1_1006, %int0_1007, %int9223372036854775807_1008, %int1_1009 : !torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %1673, [%773], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int2_1010 = torch.constant.int 2 | |
%1674 = torch.aten.unsqueeze %1673, %int2_1010 : !torch.vtensor<[1,?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %1674, [%773], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int3_1011 = torch.constant.int 3 | |
%int0_1012 = torch.constant.int 0 | |
%int9223372036854775807_1013 = torch.constant.int 9223372036854775807 | |
%int1_1014 = torch.constant.int 1 | |
%1675 = torch.aten.slice.Tensor %1674, %int3_1011, %int0_1012, %int9223372036854775807_1013, %int1_1014 : !torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %1675, [%773], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int4_1015 = torch.constant.int 4 | |
%int1_1016 = torch.constant.int 1 | |
%int1_1017 = torch.constant.int 1 | |
%int1_1018 = torch.constant.int 1 | |
%1676 = torch.prim.ListConstruct %int4_1015, %int1_1016, %int1_1017, %int1_1018 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1677 = torch.aten.repeat %1675, %1676 : !torch.vtensor<[1,?,1,128],bf16>, !torch.list<int> -> !torch.vtensor<[4,?,1,128],bf16> | |
torch.bind_symbolic_shape %1677, [%773], affine_map<()[s0] -> (4, s0 * 32, 1, 128)> : !torch.vtensor<[4,?,1,128],bf16> | |
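    // Apply rotary position embedding to the 8-head tensor %1557: cos * x + sin * rotate_half(x), where rotate_half negates the upper 64 lanes and concatenates them in front of the lower 64.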
%1678 = torch.aten.mul.Tensor %1557, %1671 : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16> -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1678, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 128)> : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
%int3_1019 = torch.constant.int 3 | |
%int0_1020 = torch.constant.int 0 | |
%int64_1021 = torch.constant.int 64 | |
%int1_1022 = torch.constant.int 1 | |
%1679 = torch.aten.slice.Tensor %1557, %int3_1019, %int0_1020, %int64_1021, %int1_1022 : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1679, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 64)> : !torch.vtensor<[4,?,8,64],f8E4M3FNUZ> | |
%int3_1023 = torch.constant.int 3 | |
%int64_1024 = torch.constant.int 64 | |
%int9223372036854775807_1025 = torch.constant.int 9223372036854775807 | |
%int1_1026 = torch.constant.int 1 | |
%1680 = torch.aten.slice.Tensor %1557, %int3_1023, %int64_1024, %int9223372036854775807_1025, %int1_1026 : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1680, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 64)> : !torch.vtensor<[4,?,8,64],f8E4M3FNUZ> | |
%1681 = torch.aten.neg %1680 : !torch.vtensor<[4,?,8,64],f8E4M3FNUZ> -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1681, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 64)> : !torch.vtensor<[4,?,8,64],f8E4M3FNUZ> | |
%1682 = torch.prim.ListConstruct %1681, %1679 : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>) -> !torch.list<vtensor> | |
%int-1_1027 = torch.constant.int -1 | |
%1683 = torch.aten.cat %1682, %int-1_1027 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1683, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 128)> : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
%1684 = torch.aten.mul.Tensor %1683, %1677 : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16> -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1684, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 128)> : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
%int1_1028 = torch.constant.int 1 | |
%1685 = torch.aten.add.Tensor %1678, %1684, %int1_1028 : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1685, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 128)> : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
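    // Compute flat cache slot indices (%arg2 * 64 + 4) and scatter the rotated keys into the paged cache %1392, whose per-page layout is viewed as [32, 2, 32, 8, 128] (likely layer x {K,V} x block_seq x kv_heads x head_dim); the f8 values are reinterpreted as si8 for the index_put and viewed back afterwards.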
%int64_1029 = torch.constant.int 64 | |
%1686 = torch.aten.mul.Scalar %arg2, %int64_1029 : !torch.vtensor<[4,?],si64>, !torch.int -> !torch.vtensor<[4,?],si64> | |
torch.bind_symbolic_shape %1686, [%773], affine_map<()[s0] -> (4, s0)> : !torch.vtensor<[4,?],si64> | |
%int4_1030 = torch.constant.int 4 | |
%int1_1031 = torch.constant.int 1 | |
%1687 = torch.aten.add.Scalar %1686, %int4_1030, %int1_1031 : !torch.vtensor<[4,?],si64>, !torch.int, !torch.int -> !torch.vtensor<[4,?],si64> | |
torch.bind_symbolic_shape %1687, [%773], affine_map<()[s0] -> (4, s0)> : !torch.vtensor<[4,?],si64> | |
%int4_1032 = torch.constant.int 4 | |
%int32_1033 = torch.constant.int 32 | |
%int8_1034 = torch.constant.int 8 | |
%int128_1035 = torch.constant.int 128 | |
%1688 = torch.prim.ListConstruct %int4_1032, %775, %int32_1033, %int8_1034, %int128_1035 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1689 = torch.aten.view %1685, %1688 : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1689, [%773], affine_map<()[s0] -> (4, s0, 32, 8, 128)> : !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ> | |
%int32_1036 = torch.constant.int 32 | |
%int8_1037 = torch.constant.int 8 | |
%int128_1038 = torch.constant.int 128 | |
%1690 = torch.prim.ListConstruct %997, %int32_1036, %int8_1037, %int128_1038 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1691 = torch.aten.view %1689, %1690 : !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1691, [%773], affine_map<()[s0] -> (s0 * 4, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%1692 = torch.prim.ListConstruct %997 : (!torch.int) -> !torch.list<int> | |
%1693 = torch.aten.view %1687, %1692 : !torch.vtensor<[4,?],si64>, !torch.list<int> -> !torch.vtensor<[?],si64> | |
torch.bind_symbolic_shape %1693, [%773], affine_map<()[s0] -> (s0 * 4)> : !torch.vtensor<[?],si64> | |
%int26_1039 = torch.constant.int 26 | |
%1694 = torch.prims.convert_element_type %1691, %int26_1039 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1694, [%773], affine_map<()[s0] -> (s0 * 4, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%int1_1040 = torch.constant.int 1 | |
%1695 = torch.aten.view.dtype %1694, %int1_1040 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %1695, [%773], affine_map<()[s0] -> (s0 * 4, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%1696 = torch.aten.detach %1695 : !torch.vtensor<[?,32,8,128],si8> -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %1696, [%773], affine_map<()[s0] -> (s0 * 4, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%1697 = torch.aten.detach %1696 : !torch.vtensor<[?,32,8,128],si8> -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %1697, [%773], affine_map<()[s0] -> (s0 * 4, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%int32_1041 = torch.constant.int 32 | |
%int2_1042 = torch.constant.int 2 | |
%int32_1043 = torch.constant.int 32 | |
%int8_1044 = torch.constant.int 8 | |
%int128_1045 = torch.constant.int 128 | |
%1698 = torch.prim.ListConstruct %776, %int32_1041, %int2_1042, %int32_1043, %int8_1044, %int128_1045 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1699 = torch.aten.view %1392, %1698 : !torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1699, [%774], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
%int32_1046 = torch.constant.int 32 | |
%int8_1047 = torch.constant.int 8 | |
%int128_1048 = torch.constant.int 128 | |
%1700 = torch.prim.ListConstruct %990, %int32_1046, %int8_1047, %int128_1048 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1701 = torch.aten.view %1699, %1700 : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1701, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%int1_1049 = torch.constant.int 1 | |
%1702 = torch.aten.view.dtype %1701, %int1_1049 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %1702, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%1703 = torch.aten.detach %1702 : !torch.vtensor<[?,32,8,128],si8> -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %1703, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%1704 = torch.aten.detach %1703 : !torch.vtensor<[?,32,8,128],si8> -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %1704, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%1705 = torch.prim.ListConstruct %1693 : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>> | |
%false_1050 = torch.constant.bool false | |
%1706 = torch.aten.index_put %1704, %1705, %1697, %false_1050 : !torch.vtensor<[?,32,8,128],si8>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],si8>, !torch.bool -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %1706, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%int26_1051 = torch.constant.int 26 | |
%1707 = torch.aten.view.dtype %1706, %int26_1051 : !torch.vtensor<[?,32,8,128],si8>, !torch.int -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1707, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%1708 = torch.aten.detach %1707 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1708, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%1709 = torch.aten.detach %1708 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1709, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%int32_1052 = torch.constant.int 32 | |
%int2_1053 = torch.constant.int 2 | |
%int32_1054 = torch.constant.int 32 | |
%int8_1055 = torch.constant.int 8 | |
%int128_1056 = torch.constant.int 128 | |
%1710 = torch.prim.ListConstruct %776, %int32_1052, %int2_1053, %int32_1054, %int8_1055, %int128_1056 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1711 = torch.aten.view %1709, %1710 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1711, [%774], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
%int2097152_1057 = torch.constant.int 2097152 | |
%1712 = torch.prim.ListConstruct %776, %int2097152_1057 : (!torch.int, !torch.int) -> !torch.list<int> | |
%1713 = torch.aten.view %1711, %1712 : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,2097152],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1713, [%774], affine_map<()[s0] -> (s0, 2097152)> : !torch.vtensor<[?,2097152],f8E4M3FNUZ> | |
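    // Same scatter for the 8-head tensor %1559 (the values), one slot further (+1) in the page layout.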
%int4_1058 = torch.constant.int 4 | |
%int32_1059 = torch.constant.int 32 | |
%int8_1060 = torch.constant.int 8 | |
%int128_1061 = torch.constant.int 128 | |
%1714 = torch.prim.ListConstruct %int4_1058, %775, %int32_1059, %int8_1060, %int128_1061 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1715 = torch.aten.view %1559, %1714 : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1715, [%773], affine_map<()[s0] -> (4, s0, 32, 8, 128)> : !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ> | |
%int32_1062 = torch.constant.int 32 | |
%int8_1063 = torch.constant.int 8 | |
%int128_1064 = torch.constant.int 128 | |
%1716 = torch.prim.ListConstruct %997, %int32_1062, %int8_1063, %int128_1064 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1717 = torch.aten.view %1715, %1716 : !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1717, [%773], affine_map<()[s0] -> (s0 * 4, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%int1_1065 = torch.constant.int 1 | |
%int1_1066 = torch.constant.int 1 | |
%1718 = torch.aten.add.Scalar %1687, %int1_1065, %int1_1066 : !torch.vtensor<[4,?],si64>, !torch.int, !torch.int -> !torch.vtensor<[4,?],si64> | |
torch.bind_symbolic_shape %1718, [%773], affine_map<()[s0] -> (4, s0)> : !torch.vtensor<[4,?],si64> | |
%1719 = torch.prim.ListConstruct %997 : (!torch.int) -> !torch.list<int> | |
%1720 = torch.aten.view %1718, %1719 : !torch.vtensor<[4,?],si64>, !torch.list<int> -> !torch.vtensor<[?],si64> | |
torch.bind_symbolic_shape %1720, [%773], affine_map<()[s0] -> (s0 * 4)> : !torch.vtensor<[?],si64> | |
%int26_1067 = torch.constant.int 26 | |
%1721 = torch.prims.convert_element_type %1717, %int26_1067 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1721, [%773], affine_map<()[s0] -> (s0 * 4, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%int1_1068 = torch.constant.int 1 | |
%1722 = torch.aten.view.dtype %1721, %int1_1068 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %1722, [%773], affine_map<()[s0] -> (s0 * 4, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%1723 = torch.aten.detach %1722 : !torch.vtensor<[?,32,8,128],si8> -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %1723, [%773], affine_map<()[s0] -> (s0 * 4, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%1724 = torch.aten.detach %1723 : !torch.vtensor<[?,32,8,128],si8> -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %1724, [%773], affine_map<()[s0] -> (s0 * 4, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%int32_1069 = torch.constant.int 32 | |
%int2_1070 = torch.constant.int 2 | |
%int32_1071 = torch.constant.int 32 | |
%int8_1072 = torch.constant.int 8 | |
%int128_1073 = torch.constant.int 128 | |
%1725 = torch.prim.ListConstruct %776, %int32_1069, %int2_1070, %int32_1071, %int8_1072, %int128_1073 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1726 = torch.aten.view %1713, %1725 : !torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1726, [%774], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
%int32_1074 = torch.constant.int 32 | |
%int8_1075 = torch.constant.int 8 | |
%int128_1076 = torch.constant.int 128 | |
%1727 = torch.prim.ListConstruct %990, %int32_1074, %int8_1075, %int128_1076 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1728 = torch.aten.view %1726, %1727 : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1728, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%int1_1077 = torch.constant.int 1 | |
%1729 = torch.aten.view.dtype %1728, %int1_1077 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %1729, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%1730 = torch.aten.detach %1729 : !torch.vtensor<[?,32,8,128],si8> -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %1730, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%1731 = torch.aten.detach %1730 : !torch.vtensor<[?,32,8,128],si8> -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %1731, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%1732 = torch.prim.ListConstruct %1720 : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>> | |
%false_1078 = torch.constant.bool false | |
%1733 = torch.aten.index_put %1731, %1732, %1724, %false_1078 : !torch.vtensor<[?,32,8,128],si8>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],si8>, !torch.bool -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %1733, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%int26_1079 = torch.constant.int 26 | |
%1734 = torch.aten.view.dtype %1733, %int26_1079 : !torch.vtensor<[?,32,8,128],si8>, !torch.int -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1734, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%1735 = torch.aten.detach %1734 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1735, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%1736 = torch.aten.detach %1735 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1736, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%int32_1080 = torch.constant.int 32 | |
%int2_1081 = torch.constant.int 2 | |
%int32_1082 = torch.constant.int 32 | |
%int8_1083 = torch.constant.int 8 | |
%int128_1084 = torch.constant.int 128 | |
%1737 = torch.prim.ListConstruct %776, %int32_1080, %int2_1081, %int32_1082, %int8_1083, %int128_1084 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1738 = torch.aten.view %1736, %1737 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1738, [%774], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
%int2097152_1085 = torch.constant.int 2097152 | |
%1739 = torch.prim.ListConstruct %776, %int2097152_1085 : (!torch.int, !torch.int) -> !torch.list<int> | |
%1740 = torch.aten.view %1738, %1739 : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,2097152],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1740, [%774], affine_map<()[s0] -> (s0, 2097152)> : !torch.vtensor<[?,2097152],f8E4M3FNUZ> | |
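    // Expand the 8 KV heads to 32 heads (group size 4, grouped-query attention) via unsqueeze/expand/_unsafe_view, for both the rotated keys and the values.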
%int-2_1086 = torch.constant.int -2 | |
%1741 = torch.aten.unsqueeze %1685, %int-2_1086 : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1741, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 1, 128)> : !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ> | |
%int4_1087 = torch.constant.int 4 | |
%int8_1088 = torch.constant.int 8 | |
%int4_1089 = torch.constant.int 4 | |
%int128_1090 = torch.constant.int 128 | |
%1742 = torch.prim.ListConstruct %int4_1087, %777, %int8_1088, %int4_1089, %int128_1090 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_1091 = torch.constant.bool false | |
%1743 = torch.aten.expand %1741, %1742, %false_1091 : !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1743, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 4, 128)> : !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ> | |
%int0_1092 = torch.constant.int 0 | |
%1744 = torch.aten.clone %1743, %int0_1092 : !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1744, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 4, 128)> : !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ> | |
%int4_1093 = torch.constant.int 4 | |
%int32_1094 = torch.constant.int 32 | |
%int128_1095 = torch.constant.int 128 | |
%1745 = torch.prim.ListConstruct %int4_1093, %777, %int32_1094, %int128_1095 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1746 = torch.aten._unsafe_view %1744, %1745 : !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1746, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 128)> : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
%int-2_1096 = torch.constant.int -2 | |
%1747 = torch.aten.unsqueeze %1559, %int-2_1096 : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1747, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 1, 128)> : !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ> | |
%int4_1097 = torch.constant.int 4 | |
%int8_1098 = torch.constant.int 8 | |
%int4_1099 = torch.constant.int 4 | |
%int128_1100 = torch.constant.int 128 | |
%1748 = torch.prim.ListConstruct %int4_1097, %777, %int8_1098, %int4_1099, %int128_1100 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_1101 = torch.constant.bool false | |
%1749 = torch.aten.expand %1747, %1748, %false_1101 : !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1749, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 4, 128)> : !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ> | |
%int0_1102 = torch.constant.int 0 | |
%1750 = torch.aten.clone %1749, %int0_1102 : !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1750, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 4, 128)> : !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ> | |
%int4_1103 = torch.constant.int 4 | |
%int32_1104 = torch.constant.int 32 | |
%int128_1105 = torch.constant.int 128 | |
%1751 = torch.prim.ListConstruct %int4_1103, %777, %int32_1104, %int128_1105 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1752 = torch.aten._unsafe_view %1750, %1751 : !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1752, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 128)> : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
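    // Transpose %1622 and the two expanded tensors to [4, 32, seq, 128] and cast them to f8E4M3FNUZ; the f32 attention mask %803 is converted to f8 as well.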
%int1_1106 = torch.constant.int 1 | |
%int2_1107 = torch.constant.int 2 | |
%1753 = torch.aten.transpose.int %1622, %int1_1106, %int2_1107 : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1753, [%773], affine_map<()[s0] -> (4, 32, s0 * 32, 128)> : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
%int1_1108 = torch.constant.int 1 | |
%int2_1109 = torch.constant.int 2 | |
%1754 = torch.aten.transpose.int %1746, %int1_1108, %int2_1109 : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1754, [%773], affine_map<()[s0] -> (4, 32, s0 * 32, 128)> : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
%int1_1110 = torch.constant.int 1 | |
%int2_1111 = torch.constant.int 2 | |
%1755 = torch.aten.transpose.int %1752, %int1_1110, %int2_1111 : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1755, [%773], affine_map<()[s0] -> (4, 32, s0 * 32, 128)> : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
%int26_1112 = torch.constant.int 26 | |
%1756 = torch.prims.convert_element_type %1753, %int26_1112 : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1756, [%773], affine_map<()[s0] -> (4, 32, s0 * 32, 128)> : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
%int26_1113 = torch.constant.int 26 | |
%1757 = torch.prims.convert_element_type %1754, %int26_1113 : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1757, [%773], affine_map<()[s0] -> (4, 32, s0 * 32, 128)> : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
%int26_1114 = torch.constant.int 26 | |
%1758 = torch.prims.convert_element_type %1755, %int26_1114 : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1758, [%773], affine_map<()[s0] -> (4, 32, s0 * 32, 128)> : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
%int26_1115 = torch.constant.int 26 | |
%1759 = torch.prims.convert_element_type %803, %int26_1115 : !torch.vtensor<[4,1,?,?],f32>, !torch.int -> !torch.vtensor<[4,1,?,?],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1759, [%773], affine_map<()[s0] -> (4, 1, s0 * 32, s0 * 32)> : !torch.vtensor<[4,1,?,?],f8E4M3FNUZ> | |
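    // Reduce the mask to a single 2-D [seq, seq] slice (batch 0, head 0) for the attention kernel.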
%int0_1116 = torch.constant.int 0 | |
%int0_1117 = torch.constant.int 0 | |
%1760 = torch.aten.select.int %1759, %int0_1116, %int0_1117 : !torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[1,?,?],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1760, [%773], affine_map<()[s0] -> (1, s0 * 32, s0 * 32)> : !torch.vtensor<[1,?,?],f8E4M3FNUZ> | |
%int0_1118 = torch.constant.int 0 | |
%int0_1119 = torch.constant.int 0 | |
%1761 = torch.aten.select.int %1760, %int0_1118, %int0_1119 : !torch.vtensor<[1,?,?],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[?,?],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1761, [%773], affine_map<()[s0] -> (s0 * 32, s0 * 32)> : !torch.vtensor<[?,?],f8E4M3FNUZ> | |
%int0_1120 = torch.constant.int 0 | |
%int0_1121 = torch.constant.int 0 | |
%int9223372036854775807_1122 = torch.constant.int 9223372036854775807 | |
%int1_1123 = torch.constant.int 1 | |
%1762 = torch.aten.slice.Tensor %1761, %int0_1120, %int0_1121, %int9223372036854775807_1122, %int1_1123 : !torch.vtensor<[?,?],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,?],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1762, [%773], affine_map<()[s0] -> (s0 * 32, s0 * 32)> : !torch.vtensor<[?,?],f8E4M3FNUZ> | |
%int1_1124 = torch.constant.int 1 | |
%int0_1125 = torch.constant.int 0 | |
%int9223372036854775807_1126 = torch.constant.int 9223372036854775807 | |
%int1_1127 = torch.constant.int 1 | |
%1763 = torch.aten.slice.Tensor %1762, %int1_1124, %int0_1125, %int9223372036854775807_1126, %int1_1127 : !torch.vtensor<[?,?],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,?],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1763, [%773], affine_map<()[s0] -> (s0 * 32, s0 * 32)> : !torch.vtensor<[?,?],f8E4M3FNUZ> | |
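    // Clone/detach the scalar %59 and pass it as the scale operand of the sharktank masked flash-attention kernel, which consumes the f8 Q/K/V and mask and returns f32.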
%none_1128 = torch.constant.none | |
%1764 = torch.aten.clone %59, %none_1128 : !torch.vtensor<[],f32>, !torch.none -> !torch.vtensor<[],f32> | |
%1765 = torch.aten.detach %1764 : !torch.vtensor<[],f32> -> !torch.vtensor<[],f32> | |
%1766 = torch.aten.detach %1765 : !torch.vtensor<[],f32> -> !torch.vtensor<[],f32> | |
%1767 = torch.aten.detach %1766 : !torch.vtensor<[],f32> -> !torch.vtensor<[],f32> | |
%1768 = torch_c.to_builtin_tensor %1756 : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> -> tensor<4x32x?x128xf8E4M3FNUZ> | |
%1769 = torch_c.to_builtin_tensor %1757 : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> -> tensor<4x32x?x128xf8E4M3FNUZ> | |
%1770 = torch_c.to_builtin_tensor %1758 : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> -> tensor<4x32x?x128xf8E4M3FNUZ> | |
%1771 = torch_c.to_builtin_tensor %1763 : !torch.vtensor<[?,?],f8E4M3FNUZ> -> tensor<?x?xf8E4M3FNUZ> | |
%1772 = torch_c.to_builtin_tensor %1767 : !torch.vtensor<[],f32> -> tensor<f32> | |
%1773 = util.call @sharktank_masked_flash_attention_4_32_128_128_f8E4M3FNUZ_f8E4M3FNUZ_f32_f32(%1768, %1769, %1770, %1772, %1771) : (tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<f32>, tensor<?x?xf8E4M3FNUZ>) -> tensor<4x32x?x128xf32> | |
%1774 = torch_c.from_builtin_tensor %1773 : tensor<4x32x?x128xf32> -> !torch.vtensor<[4,32,?,128],f32> | |
torch.bind_symbolic_shape %1774, [%773], affine_map<()[s0] -> (4, 32, s0 * 32, 128)> : !torch.vtensor<[4,32,?,128],f32> | |
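    // Transpose the attention output back to [4, seq, 32, 128], flatten to [4, seq, 4096], quantize (divide by the input rscale, clamp to the f8E4M3FNUZ range [-240, 240]) and run the output projection through the batched transpose-B matmul kernel, then rescale and add the residual %1504.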
%int1_1129 = torch.constant.int 1 | |
%int2_1130 = torch.constant.int 2 | |
%1775 = torch.aten.transpose.int %1774, %int1_1129, %int2_1130 : !torch.vtensor<[4,32,?,128],f32>, !torch.int, !torch.int -> !torch.vtensor<[4,?,32,128],f32> | |
torch.bind_symbolic_shape %1775, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 128)> : !torch.vtensor<[4,?,32,128],f32> | |
%int0_1131 = torch.constant.int 0 | |
%1776 = torch.aten.clone %1775, %int0_1131 : !torch.vtensor<[4,?,32,128],f32>, !torch.int -> !torch.vtensor<[4,?,32,128],f32> | |
torch.bind_symbolic_shape %1776, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 128)> : !torch.vtensor<[4,?,32,128],f32> | |
%int4_1132 = torch.constant.int 4 | |
%int4096_1133 = torch.constant.int 4096 | |
%1777 = torch.prim.ListConstruct %int4_1132, %777, %int4096_1133 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1778 = torch.aten._unsafe_view %1776, %1777 : !torch.vtensor<[4,?,32,128],f32>, !torch.list<int> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1778, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%1779 = torch.aten.div.Tensor %1778, %60 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1779, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%float-2.400000e02_1134 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_1135 = torch.constant.float 2.400000e+02 | |
%1780 = torch.aten.clamp %1779, %float-2.400000e02_1134, %float2.400000e02_1135 : !torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1780, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int26_1136 = torch.constant.int 26 | |
%1781 = torch.prims.convert_element_type %1780, %int26_1136 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1781, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
%int0_1137 = torch.constant.int 0 | |
%1782 = torch.aten.unsqueeze %61, %int0_1137 : !torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ> | |
%int4_1138 = torch.constant.int 4 | |
%int4096_1139 = torch.constant.int 4096 | |
%int4096_1140 = torch.constant.int 4096 | |
%1783 = torch.prim.ListConstruct %int4_1138, %int4096_1139, %int4096_1140 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_1141 = torch.constant.bool false | |
%1784 = torch.aten.expand %1782, %1783, %false_1141 : !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ> | |
%1785 = torch_c.to_builtin_tensor %1781 : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> -> tensor<4x?x4096xf8E4M3FNUZ> | |
%1786 = torch_c.to_builtin_tensor %1784 : !torch.vtensor<[4,4096,4096],f8E4M3FNUZ> -> tensor<4x4096x4096xf8E4M3FNUZ> | |
%1787 = util.call @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ_f32(%1785, %1786) : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x?x4096xf32> | |
%1788 = torch_c.from_builtin_tensor %1787 : tensor<4x?x4096xf32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1788, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%1789 = torch.prim.ListConstruct : () -> !torch.list<int> | |
%1790 = torch.aten.permute %62, %1789 : !torch.vtensor<[],f32>, !torch.list<int> -> !torch.vtensor<[],f32> | |
%1791 = torch.aten.mul.Tensor %60, %1790 : !torch.vtensor<[],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[],f32> | |
%int6_1142 = torch.constant.int 6 | |
%1792 = torch.prims.convert_element_type %1788, %int6_1142 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1792, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%1793 = torch.aten.mul.Tensor %1792, %1791 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1793, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int1_1143 = torch.constant.int 1 | |
%1794 = torch.aten.add.Tensor %1504, %1793, %int1_1143 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1794, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
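    // FFN RMSNorm: mean of squares over the hidden dim, add eps 1e-5, rsqrt, scale by the [4096] bf16 norm weight; then quantize the normalized activations for the first FFN projection.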
%int6_1144 = torch.constant.int 6 | |
%1795 = torch.prims.convert_element_type %1794, %int6_1144 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1795, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int2_1145 = torch.constant.int 2 | |
%1796 = torch.aten.pow.Tensor_Scalar %1795, %int2_1145 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1796, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int-1_1146 = torch.constant.int -1 | |
%1797 = torch.prim.ListConstruct %int-1_1146 : (!torch.int) -> !torch.list<int> | |
%true_1147 = torch.constant.bool true | |
%none_1148 = torch.constant.none | |
%1798 = torch.aten.mean.dim %1796, %1797, %true_1147, %none_1148 : !torch.vtensor<[4,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[4,?,1],f32> | |
torch.bind_symbolic_shape %1798, [%773], affine_map<()[s0] -> (4, s0 * 32, 1)> : !torch.vtensor<[4,?,1],f32> | |
%float1.000000e-05_1149 = torch.constant.float 1.000000e-05 | |
%int1_1150 = torch.constant.int 1 | |
%1799 = torch.aten.add.Scalar %1798, %float1.000000e-05_1149, %int1_1150 : !torch.vtensor<[4,?,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[4,?,1],f32> | |
torch.bind_symbolic_shape %1799, [%773], affine_map<()[s0] -> (4, s0 * 32, 1)> : !torch.vtensor<[4,?,1],f32> | |
%1800 = torch.aten.rsqrt %1799 : !torch.vtensor<[4,?,1],f32> -> !torch.vtensor<[4,?,1],f32> | |
torch.bind_symbolic_shape %1800, [%773], affine_map<()[s0] -> (4, s0 * 32, 1)> : !torch.vtensor<[4,?,1],f32> | |
%1801 = torch.aten.mul.Tensor %1795, %1800 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[4,?,1],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1801, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int6_1151 = torch.constant.int 6 | |
%1802 = torch.prims.convert_element_type %1801, %int6_1151 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1802, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%1803 = torch.aten.mul.Tensor %63, %1802 : !torch.vtensor<[4096],bf16>, !torch.vtensor<[4,?,4096],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1803, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int6_1152 = torch.constant.int 6 | |
%1804 = torch.prims.convert_element_type %1803, %int6_1152 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1804, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%1805 = torch.aten.div.Tensor %1804, %64 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1805, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%float-2.400000e02_1153 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_1154 = torch.constant.float 2.400000e+02 | |
%1806 = torch.aten.clamp %1805, %float-2.400000e02_1153, %float2.400000e02_1154 : !torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1806, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int26_1155 = torch.constant.int 26 | |
%1807 = torch.prims.convert_element_type %1806, %int26_1155 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1807, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
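    // Gate projection: f8 matmul against the [14336, 4096] weight %65, rescaled by the combined input/weight scales, followed by SiLU.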
%int0_1156 = torch.constant.int 0 | |
%1808 = torch.aten.unsqueeze %65, %int0_1156 : !torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ> | |
%int4_1157 = torch.constant.int 4 | |
%int14336_1158 = torch.constant.int 14336 | |
%int4096_1159 = torch.constant.int 4096 | |
%1809 = torch.prim.ListConstruct %int4_1157, %int14336_1158, %int4096_1159 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_1160 = torch.constant.bool false | |
%1810 = torch.aten.expand %1808, %1809, %false_1160 : !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ> | |
%1811 = torch_c.to_builtin_tensor %1807 : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> -> tensor<4x?x4096xf8E4M3FNUZ> | |
%1812 = torch_c.to_builtin_tensor %1810 : !torch.vtensor<[4,14336,4096],f8E4M3FNUZ> -> tensor<4x14336x4096xf8E4M3FNUZ> | |
%1813 = util.call @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ_f32(%1811, %1812) : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x?x14336xf32> | |
%1814 = torch_c.from_builtin_tensor %1813 : tensor<4x?x14336xf32> -> !torch.vtensor<[4,?,14336],f32> | |
torch.bind_symbolic_shape %1814, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f32> | |
%1815 = torch.prim.ListConstruct : () -> !torch.list<int> | |
%1816 = torch.aten.permute %66, %1815 : !torch.vtensor<[],f32>, !torch.list<int> -> !torch.vtensor<[],f32> | |
%1817 = torch.aten.mul.Tensor %64, %1816 : !torch.vtensor<[],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[],f32> | |
%int6_1161 = torch.constant.int 6 | |
%1818 = torch.prims.convert_element_type %1814, %int6_1161 : !torch.vtensor<[4,?,14336],f32>, !torch.int -> !torch.vtensor<[4,?,14336],f32> | |
torch.bind_symbolic_shape %1818, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f32> | |
%1819 = torch.aten.mul.Tensor %1818, %1817 : !torch.vtensor<[4,?,14336],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,14336],f32> | |
torch.bind_symbolic_shape %1819, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f32> | |
%1820 = torch.aten.silu %1819 : !torch.vtensor<[4,?,14336],f32> -> !torch.vtensor<[4,?,14336],f32> | |
torch.bind_symbolic_shape %1820, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f32> | |
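    // Up projection on the same normalized activations (weight %68), rescaled and multiplied elementwise with the SiLU-gated branch.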
%1821 = torch.aten.div.Tensor %1804, %67 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1821, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%float-2.400000e02_1162 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_1163 = torch.constant.float 2.400000e+02 | |
%1822 = torch.aten.clamp %1821, %float-2.400000e02_1162, %float2.400000e02_1163 : !torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1822, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int26_1164 = torch.constant.int 26 | |
%1823 = torch.prims.convert_element_type %1822, %int26_1164 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1823, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
%int0_1165 = torch.constant.int 0 | |
%1824 = torch.aten.unsqueeze %68, %int0_1165 : !torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ> | |
%int4_1166 = torch.constant.int 4 | |
%int14336_1167 = torch.constant.int 14336 | |
%int4096_1168 = torch.constant.int 4096 | |
%1825 = torch.prim.ListConstruct %int4_1166, %int14336_1167, %int4096_1168 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_1169 = torch.constant.bool false | |
%1826 = torch.aten.expand %1824, %1825, %false_1169 : !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ> | |
%1827 = torch_c.to_builtin_tensor %1823 : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> -> tensor<4x?x4096xf8E4M3FNUZ> | |
%1828 = torch_c.to_builtin_tensor %1826 : !torch.vtensor<[4,14336,4096],f8E4M3FNUZ> -> tensor<4x14336x4096xf8E4M3FNUZ> | |
%1829 = util.call @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ_f32(%1827, %1828) : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x?x14336xf32> | |
%1830 = torch_c.from_builtin_tensor %1829 : tensor<4x?x14336xf32> -> !torch.vtensor<[4,?,14336],f32> | |
torch.bind_symbolic_shape %1830, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f32> | |
%1831 = torch.prim.ListConstruct : () -> !torch.list<int> | |
%1832 = torch.aten.permute %69, %1831 : !torch.vtensor<[],f32>, !torch.list<int> -> !torch.vtensor<[],f32> | |
%1833 = torch.aten.mul.Tensor %67, %1832 : !torch.vtensor<[],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[],f32> | |
%int6_1170 = torch.constant.int 6 | |
%1834 = torch.prims.convert_element_type %1830, %int6_1170 : !torch.vtensor<[4,?,14336],f32>, !torch.int -> !torch.vtensor<[4,?,14336],f32> | |
torch.bind_symbolic_shape %1834, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f32> | |
%1835 = torch.aten.mul.Tensor %1834, %1833 : !torch.vtensor<[4,?,14336],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,14336],f32> | |
torch.bind_symbolic_shape %1835, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f32> | |
%1836 = torch.aten.mul.Tensor %1820, %1835 : !torch.vtensor<[4,?,14336],f32>, !torch.vtensor<[4,?,14336],f32> -> !torch.vtensor<[4,?,14336],f32> | |
torch.bind_symbolic_shape %1836, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f32> | |
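    // Down projection back to 4096 (weight %71, [4096, 14336]), rescaled, then added to the residual stream %1794.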
%1837 = torch.aten.div.Tensor %1836, %70 : !torch.vtensor<[4,?,14336],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,14336],f32> | |
torch.bind_symbolic_shape %1837, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f32> | |
%float-2.400000e02_1171 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_1172 = torch.constant.float 2.400000e+02 | |
%1838 = torch.aten.clamp %1837, %float-2.400000e02_1171, %float2.400000e02_1172 : !torch.vtensor<[4,?,14336],f32>, !torch.float, !torch.float -> !torch.vtensor<[4,?,14336],f32> | |
torch.bind_symbolic_shape %1838, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f32> | |
%int26_1173 = torch.constant.int 26 | |
%1839 = torch.prims.convert_element_type %1838, %int26_1173 : !torch.vtensor<[4,?,14336],f32>, !torch.int -> !torch.vtensor<[4,?,14336],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1839, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f8E4M3FNUZ> | |
%int0_1174 = torch.constant.int 0 | |
%1840 = torch.aten.unsqueeze %71, %int0_1174 : !torch.vtensor<[4096,14336],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,4096,14336],f8E4M3FNUZ> | |
%int4_1175 = torch.constant.int 4 | |
%int4096_1176 = torch.constant.int 4096 | |
%int14336_1177 = torch.constant.int 14336 | |
%1841 = torch.prim.ListConstruct %int4_1175, %int4096_1176, %int14336_1177 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_1178 = torch.constant.bool false | |
%1842 = torch.aten.expand %1840, %1841, %false_1178 : !torch.vtensor<[1,4096,14336],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[4,4096,14336],f8E4M3FNUZ> | |
%1843 = torch_c.to_builtin_tensor %1839 : !torch.vtensor<[4,?,14336],f8E4M3FNUZ> -> tensor<4x?x14336xf8E4M3FNUZ> | |
%1844 = torch_c.to_builtin_tensor %1842 : !torch.vtensor<[4,4096,14336],f8E4M3FNUZ> -> tensor<4x4096x14336xf8E4M3FNUZ> | |
%1845 = util.call @sharktank_batch_matmul_transpose_b_L4xDx14336xf8E4M3FNUZ_R4x4096x14336xf8E4M3FNUZ_f32(%1843, %1844) : (tensor<4x?x14336xf8E4M3FNUZ>, tensor<4x4096x14336xf8E4M3FNUZ>) -> tensor<4x?x4096xf32> | |
%1846 = torch_c.from_builtin_tensor %1845 : tensor<4x?x4096xf32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1846, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%1847 = torch.prim.ListConstruct : () -> !torch.list<int> | |
%1848 = torch.aten.permute %72, %1847 : !torch.vtensor<[],f32>, !torch.list<int> -> !torch.vtensor<[],f32> | |
%1849 = torch.aten.mul.Tensor %70, %1848 : !torch.vtensor<[],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[],f32> | |
%int6_1179 = torch.constant.int 6 | |
%1850 = torch.prims.convert_element_type %1846, %int6_1179 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1850, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%1851 = torch.aten.mul.Tensor %1850, %1849 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1851, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int1_1180 = torch.constant.int 1 | |
%1852 = torch.aten.add.Tensor %1794, %1851, %int1_1180 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1852, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
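    // What follows appears to be the next decoder block: attention RMSNorm over the new residual, then quantization of the normalized activations.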
%int6_1181 = torch.constant.int 6 | |
%1853 = torch.prims.convert_element_type %1852, %int6_1181 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1853, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int2_1182 = torch.constant.int 2 | |
%1854 = torch.aten.pow.Tensor_Scalar %1853, %int2_1182 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1854, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int-1_1183 = torch.constant.int -1 | |
%1855 = torch.prim.ListConstruct %int-1_1183 : (!torch.int) -> !torch.list<int> | |
%true_1184 = torch.constant.bool true | |
%none_1185 = torch.constant.none | |
%1856 = torch.aten.mean.dim %1854, %1855, %true_1184, %none_1185 : !torch.vtensor<[4,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[4,?,1],f32> | |
torch.bind_symbolic_shape %1856, [%773], affine_map<()[s0] -> (4, s0 * 32, 1)> : !torch.vtensor<[4,?,1],f32> | |
%float1.000000e-05_1186 = torch.constant.float 1.000000e-05 | |
%int1_1187 = torch.constant.int 1 | |
%1857 = torch.aten.add.Scalar %1856, %float1.000000e-05_1186, %int1_1187 : !torch.vtensor<[4,?,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[4,?,1],f32> | |
torch.bind_symbolic_shape %1857, [%773], affine_map<()[s0] -> (4, s0 * 32, 1)> : !torch.vtensor<[4,?,1],f32> | |
%1858 = torch.aten.rsqrt %1857 : !torch.vtensor<[4,?,1],f32> -> !torch.vtensor<[4,?,1],f32> | |
torch.bind_symbolic_shape %1858, [%773], affine_map<()[s0] -> (4, s0 * 32, 1)> : !torch.vtensor<[4,?,1],f32> | |
%1859 = torch.aten.mul.Tensor %1853, %1858 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[4,?,1],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1859, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int6_1188 = torch.constant.int 6 | |
%1860 = torch.prims.convert_element_type %1859, %int6_1188 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1860, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%1861 = torch.aten.mul.Tensor %73, %1860 : !torch.vtensor<[4096],bf16>, !torch.vtensor<[4,?,4096],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1861, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int6_1189 = torch.constant.int 6 | |
%1862 = torch.prims.convert_element_type %1861, %int6_1189 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1862, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%1863 = torch.aten.div.Tensor %1862, %74 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1863, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%float-2.400000e02_1190 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_1191 = torch.constant.float 2.400000e+02 | |
%1864 = torch.aten.clamp %1863, %float-2.400000e02_1190, %float2.400000e02_1191 : !torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1864, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int26_1192 = torch.constant.int 26 | |
%1865 = torch.prims.convert_element_type %1864, %int26_1192 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1865, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
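    // Q projection: batched transpose-B matmul against the [4096, 4096] f8 weight %75, with the f32 result requantized to f8.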
%int0_1193 = torch.constant.int 0 | |
%1866 = torch.aten.unsqueeze %75, %int0_1193 : !torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ> | |
%int4_1194 = torch.constant.int 4 | |
%int4096_1195 = torch.constant.int 4096 | |
%int4096_1196 = torch.constant.int 4096 | |
%1867 = torch.prim.ListConstruct %int4_1194, %int4096_1195, %int4096_1196 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_1197 = torch.constant.bool false | |
%1868 = torch.aten.expand %1866, %1867, %false_1197 : !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ> | |
%1869 = torch_c.to_builtin_tensor %1865 : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> -> tensor<4x?x4096xf8E4M3FNUZ> | |
%1870 = torch_c.to_builtin_tensor %1868 : !torch.vtensor<[4,4096,4096],f8E4M3FNUZ> -> tensor<4x4096x4096xf8E4M3FNUZ> | |
%1871 = util.call @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ_f32(%1869, %1870) : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x?x4096xf32> | |
%1872 = torch_c.from_builtin_tensor %1871 : tensor<4x?x4096xf32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1872, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%1873 = torch.aten.div.Tensor %1872, %76 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1873, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%float-2.400000e02_1198 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_1199 = torch.constant.float 2.400000e+02 | |
%1874 = torch.aten.clamp %1873, %float-2.400000e02_1198, %float2.400000e02_1199 : !torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1874, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int26_1200 = torch.constant.int 26 | |
%1875 = torch.prims.convert_element_type %1874, %int26_1200 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1875, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
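    // K projection: the normalized activations are requantized against scale %77 and multiplied by the [1024, 4096] weight %78, then quantized again.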
%1876 = torch.aten.div.Tensor %1862, %77 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1876, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%float-2.400000e02_1201 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_1202 = torch.constant.float 2.400000e+02 | |
%1877 = torch.aten.clamp %1876, %float-2.400000e02_1201, %float2.400000e02_1202 : !torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1877, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int26_1203 = torch.constant.int 26 | |
%1878 = torch.prims.convert_element_type %1877, %int26_1203 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1878, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
%int0_1204 = torch.constant.int 0 | |
%1879 = torch.aten.unsqueeze %78, %int0_1204 : !torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ> | |
%int4_1205 = torch.constant.int 4 | |
%int1024_1206 = torch.constant.int 1024 | |
%int4096_1207 = torch.constant.int 4096 | |
%1880 = torch.prim.ListConstruct %int4_1205, %int1024_1206, %int4096_1207 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_1208 = torch.constant.bool false | |
%1881 = torch.aten.expand %1879, %1880, %false_1208 : !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ> | |
%1882 = torch_c.to_builtin_tensor %1878 : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> -> tensor<4x?x4096xf8E4M3FNUZ> | |
%1883 = torch_c.to_builtin_tensor %1881 : !torch.vtensor<[4,1024,4096],f8E4M3FNUZ> -> tensor<4x1024x4096xf8E4M3FNUZ> | |
%1884 = util.call @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ_f32(%1882, %1883) : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x?x1024xf32> | |
%1885 = torch_c.from_builtin_tensor %1884 : tensor<4x?x1024xf32> -> !torch.vtensor<[4,?,1024],f32> | |
torch.bind_symbolic_shape %1885, [%773], affine_map<()[s0] -> (4, s0 * 32, 1024)> : !torch.vtensor<[4,?,1024],f32> | |
%1886 = torch.aten.div.Tensor %1885, %79 : !torch.vtensor<[4,?,1024],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,1024],f32> | |
torch.bind_symbolic_shape %1886, [%773], affine_map<()[s0] -> (4, s0 * 32, 1024)> : !torch.vtensor<[4,?,1024],f32> | |
%float-2.400000e02_1209 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_1210 = torch.constant.float 2.400000e+02 | |
%1887 = torch.aten.clamp %1886, %float-2.400000e02_1209, %float2.400000e02_1210 : !torch.vtensor<[4,?,1024],f32>, !torch.float, !torch.float -> !torch.vtensor<[4,?,1024],f32> | |
torch.bind_symbolic_shape %1887, [%773], affine_map<()[s0] -> (4, s0 * 32, 1024)> : !torch.vtensor<[4,?,1024],f32> | |
%int26_1211 = torch.constant.int 26 | |
%1888 = torch.prims.convert_element_type %1887, %int26_1211 : !torch.vtensor<[4,?,1024],f32>, !torch.int -> !torch.vtensor<[4,?,1024],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1888, [%773], affine_map<()[s0] -> (4, s0 * 32, 1024)> : !torch.vtensor<[4,?,1024],f8E4M3FNUZ> | |
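// Same pattern for the V projection: quantize with the attn_v input rscale, run the batched f8 matmul against the [1024,4096] attn_v weight, rescale and requantize to f8E4M3FNUZ.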
%1889 = torch.aten.div.Tensor %1862, %80 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1889, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%float-2.400000e02_1212 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_1213 = torch.constant.float 2.400000e+02 | |
%1890 = torch.aten.clamp %1889, %float-2.400000e02_1212, %float2.400000e02_1213 : !torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1890, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int26_1214 = torch.constant.int 26 | |
%1891 = torch.prims.convert_element_type %1890, %int26_1214 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1891, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
%int0_1215 = torch.constant.int 0 | |
%1892 = torch.aten.unsqueeze %81, %int0_1215 : !torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ> | |
%int4_1216 = torch.constant.int 4 | |
%int1024_1217 = torch.constant.int 1024 | |
%int4096_1218 = torch.constant.int 4096 | |
%1893 = torch.prim.ListConstruct %int4_1216, %int1024_1217, %int4096_1218 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_1219 = torch.constant.bool false | |
%1894 = torch.aten.expand %1892, %1893, %false_1219 : !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ> | |
%1895 = torch_c.to_builtin_tensor %1891 : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> -> tensor<4x?x4096xf8E4M3FNUZ> | |
%1896 = torch_c.to_builtin_tensor %1894 : !torch.vtensor<[4,1024,4096],f8E4M3FNUZ> -> tensor<4x1024x4096xf8E4M3FNUZ> | |
%1897 = util.call @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ_f32(%1895, %1896) : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x?x1024xf32> | |
%1898 = torch_c.from_builtin_tensor %1897 : tensor<4x?x1024xf32> -> !torch.vtensor<[4,?,1024],f32> | |
torch.bind_symbolic_shape %1898, [%773], affine_map<()[s0] -> (4, s0 * 32, 1024)> : !torch.vtensor<[4,?,1024],f32> | |
%1899 = torch.aten.div.Tensor %1898, %82 : !torch.vtensor<[4,?,1024],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,1024],f32> | |
torch.bind_symbolic_shape %1899, [%773], affine_map<()[s0] -> (4, s0 * 32, 1024)> : !torch.vtensor<[4,?,1024],f32> | |
%float-2.400000e02_1220 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_1221 = torch.constant.float 2.400000e+02 | |
%1900 = torch.aten.clamp %1899, %float-2.400000e02_1220, %float2.400000e02_1221 : !torch.vtensor<[4,?,1024],f32>, !torch.float, !torch.float -> !torch.vtensor<[4,?,1024],f32> | |
torch.bind_symbolic_shape %1900, [%773], affine_map<()[s0] -> (4, s0 * 32, 1024)> : !torch.vtensor<[4,?,1024],f32> | |
%int26_1222 = torch.constant.int 26 | |
%1901 = torch.prims.convert_element_type %1900, %int26_1222 : !torch.vtensor<[4,?,1024],f32>, !torch.int -> !torch.vtensor<[4,?,1024],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1901, [%773], affine_map<()[s0] -> (4, s0 * 32, 1024)> : !torch.vtensor<[4,?,1024],f8E4M3FNUZ> | |
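// Reshape the projected activations into heads: queries to [4,?,32,128], keys and values to [4,?,8,128], i.e. 32 query heads and 8 KV heads of dimension 128 (grouped-query attention).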
%int4_1223 = torch.constant.int 4 | |
%int32_1224 = torch.constant.int 32 | |
%int128_1225 = torch.constant.int 128 | |
%1902 = torch.prim.ListConstruct %int4_1223, %777, %int32_1224, %int128_1225 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1903 = torch.aten.view %1875, %1902 : !torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1903, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 128)> : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
%int4_1226 = torch.constant.int 4 | |
%int8_1227 = torch.constant.int 8 | |
%int128_1228 = torch.constant.int 128 | |
%1904 = torch.prim.ListConstruct %int4_1226, %777, %int8_1227, %int128_1228 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1905 = torch.aten.view %1888, %1904 : !torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1905, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 128)> : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
%int4_1229 = torch.constant.int 4 | |
%int8_1230 = torch.constant.int 8 | |
%int128_1231 = torch.constant.int 128 | |
%1906 = torch.prim.ListConstruct %int4_1229, %777, %int8_1230, %int128_1231 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1907 = torch.aten.view %1901, %1906 : !torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1907, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 128)> : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
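// The ops below appear to build the rotary-embedding tables: positions 0..131071, inverse frequencies with base 5.0e5 over a head dimension of 128, a Llama-3-style wavelength-dependent rescaling (original context 8192, scaling factor 8), and cos/sin tables cast to bf16, which are then sliced to the current sequence length and applied to the query heads.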
%int131072_1232 = torch.constant.int 131072 | |
%none_1233 = torch.constant.none | |
%none_1234 = torch.constant.none | |
%cpu_1235 = torch.constant.device "cpu" | |
%false_1236 = torch.constant.bool false | |
%1908 = torch.aten.arange %int131072_1232, %none_1233, %none_1234, %cpu_1235, %false_1236 : !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[131072],si64> | |
%int0_1237 = torch.constant.int 0 | |
%int128_1238 = torch.constant.int 128 | |
%int2_1239 = torch.constant.int 2 | |
%int4_1240 = torch.constant.int 4 | |
%none_1241 = torch.constant.none | |
%cpu_1242 = torch.constant.device "cpu" | |
%false_1243 = torch.constant.bool false | |
%1909 = torch.aten.arange.start_step %int0_1237, %int128_1238, %int2_1239, %int4_1240, %none_1241, %cpu_1242, %false_1243 : !torch.int, !torch.int, !torch.int, !torch.int, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[64],si64> | |
%int6_1244 = torch.constant.int 6 | |
%1910 = torch.prims.convert_element_type %1909, %int6_1244 : !torch.vtensor<[64],si64>, !torch.int -> !torch.vtensor<[64],f32> | |
%int128_1245 = torch.constant.int 128 | |
%1911 = torch.aten.div.Scalar %1910, %int128_1245 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%float5.000000e05_1246 = torch.constant.float 5.000000e+05 | |
%1912 = torch.aten.pow.Scalar %float5.000000e05_1246, %1911 : !torch.float, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%1913 = torch.aten.reciprocal %1912 : !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%float1.000000e00_1247 = torch.constant.float 1.000000e+00 | |
%1914 = torch.aten.mul.Scalar %1913, %float1.000000e00_1247 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],f32> | |
%1915 = torch.aten.reciprocal %1914 : !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%float6.283190e00_1248 = torch.constant.float 6.2831853071795862 | |
%1916 = torch.aten.mul.Scalar %1915, %float6.283190e00_1248 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],f32> | |
%float8.192000e03_1249 = torch.constant.float 8.192000e+03 | |
%1917 = torch.aten.gt.Scalar %1916, %float8.192000e03_1249 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],i1> | |
%int8_1250 = torch.constant.int 8 | |
%1918 = torch.aten.div.Scalar %1914, %int8_1250 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%1919 = torch.aten.where.self %1917, %1918, %1914 : !torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%1920 = torch.aten.reciprocal %1916 : !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%int8192_1251 = torch.constant.int 8192 | |
%1921 = torch.aten.mul.Scalar %1920, %int8192_1251 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%int1_1252 = torch.constant.int 1 | |
%int1_1253 = torch.constant.int 1 | |
%1922 = torch.aten.sub.Scalar %1921, %int1_1252, %int1_1253 : !torch.vtensor<[64],f32>, !torch.int, !torch.int -> !torch.vtensor<[64],f32> | |
%int3_1254 = torch.constant.int 3 | |
%1923 = torch.aten.div.Scalar %1922, %int3_1254 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%int1_1255 = torch.constant.int 1 | |
%int1_1256 = torch.constant.int 1 | |
%1924 = torch.aten.rsub.Scalar %1923, %int1_1255, %int1_1256 : !torch.vtensor<[64],f32>, !torch.int, !torch.int -> !torch.vtensor<[64],f32> | |
%1925 = torch.aten.mul.Tensor %1924, %1919 : !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%int8_1257 = torch.constant.int 8 | |
%1926 = torch.aten.div.Scalar %1925, %int8_1257 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%1927 = torch.aten.mul.Tensor %1923, %1919 : !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%int1_1258 = torch.constant.int 1 | |
%1928 = torch.aten.add.Tensor %1926, %1927, %int1_1258 : !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%float2.048000e03_1259 = torch.constant.float 2.048000e+03 | |
%1929 = torch.aten.lt.Scalar %1916, %float2.048000e03_1259 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],i1> | |
%1930 = torch.aten.bitwise_not %1929 : !torch.vtensor<[64],i1> -> !torch.vtensor<[64],i1> | |
%float8.192000e03_1260 = torch.constant.float 8.192000e+03 | |
%1931 = torch.aten.gt.Scalar %1916, %float8.192000e03_1260 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],i1> | |
%1932 = torch.aten.bitwise_not %1931 : !torch.vtensor<[64],i1> -> !torch.vtensor<[64],i1> | |
%1933 = torch.aten.mul.Tensor %1930, %1932 : !torch.vtensor<[64],i1>, !torch.vtensor<[64],i1> -> !torch.vtensor<[64],i1> | |
%1934 = torch.aten.where.self %1933, %1928, %1919 : !torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%1935 = torch.prim.ListConstruct %1934, %1934 : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.list<vtensor> | |
%int-1_1261 = torch.constant.int -1 | |
%1936 = torch.aten.cat %1935, %int-1_1261 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[128],f32> | |
%int6_1262 = torch.constant.int 6 | |
%1937 = torch.prims.convert_element_type %1936, %int6_1262 : !torch.vtensor<[128],f32>, !torch.int -> !torch.vtensor<[128],f32> | |
%int1_1263 = torch.constant.int 1 | |
%1938 = torch.aten.unsqueeze %1908, %int1_1263 : !torch.vtensor<[131072],si64>, !torch.int -> !torch.vtensor<[131072,1],si64> | |
%int6_1264 = torch.constant.int 6 | |
%1939 = torch.prims.convert_element_type %1938, %int6_1264 : !torch.vtensor<[131072,1],si64>, !torch.int -> !torch.vtensor<[131072,1],f32> | |
%int0_1265 = torch.constant.int 0 | |
%1940 = torch.aten.unsqueeze %1937, %int0_1265 : !torch.vtensor<[128],f32>, !torch.int -> !torch.vtensor<[1,128],f32> | |
%int6_1266 = torch.constant.int 6 | |
%1941 = torch.prims.convert_element_type %1940, %int6_1266 : !torch.vtensor<[1,128],f32>, !torch.int -> !torch.vtensor<[1,128],f32> | |
%1942 = torch.aten.mul.Tensor %1939, %1941 : !torch.vtensor<[131072,1],f32>, !torch.vtensor<[1,128],f32> -> !torch.vtensor<[131072,128],f32> | |
%1943 = torch.aten.cos %1942 : !torch.vtensor<[131072,128],f32> -> !torch.vtensor<[131072,128],f32> | |
%int15_1267 = torch.constant.int 15 | |
%1944 = torch.prims.convert_element_type %1943, %int15_1267 : !torch.vtensor<[131072,128],f32>, !torch.int -> !torch.vtensor<[131072,128],bf16> | |
%1945 = torch.aten.sin %1942 : !torch.vtensor<[131072,128],f32> -> !torch.vtensor<[131072,128],f32> | |
%int15_1268 = torch.constant.int 15 | |
%1946 = torch.prims.convert_element_type %1945, %int15_1268 : !torch.vtensor<[131072,128],f32>, !torch.int -> !torch.vtensor<[131072,128],bf16> | |
%int0_1269 = torch.constant.int 0 | |
%int0_1270 = torch.constant.int 0 | |
%int1_1271 = torch.constant.int 1 | |
%1947 = torch.aten.slice.Tensor %1944, %int0_1269, %int0_1270, %777, %int1_1271 : !torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %1947, [%773], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int1_1272 = torch.constant.int 1 | |
%int0_1273 = torch.constant.int 0 | |
%int9223372036854775807_1274 = torch.constant.int 9223372036854775807 | |
%int1_1275 = torch.constant.int 1 | |
%1948 = torch.aten.slice.Tensor %1947, %int1_1272, %int0_1273, %int9223372036854775807_1274, %int1_1275 : !torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %1948, [%773], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int0_1276 = torch.constant.int 0 | |
%int0_1277 = torch.constant.int 0 | |
%int1_1278 = torch.constant.int 1 | |
%1949 = torch.aten.slice.Tensor %1946, %int0_1276, %int0_1277, %777, %int1_1278 : !torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %1949, [%773], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int1_1279 = torch.constant.int 1 | |
%int0_1280 = torch.constant.int 0 | |
%int9223372036854775807_1281 = torch.constant.int 9223372036854775807 | |
%int1_1282 = torch.constant.int 1 | |
%1950 = torch.aten.slice.Tensor %1949, %int1_1279, %int0_1280, %int9223372036854775807_1281, %int1_1282 : !torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %1950, [%773], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int0_1283 = torch.constant.int 0 | |
%1951 = torch.aten.unsqueeze %1948, %int0_1283 : !torch.vtensor<[?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %1951, [%773], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int1_1284 = torch.constant.int 1 | |
%int0_1285 = torch.constant.int 0 | |
%int9223372036854775807_1286 = torch.constant.int 9223372036854775807 | |
%int1_1287 = torch.constant.int 1 | |
%1952 = torch.aten.slice.Tensor %1951, %int1_1284, %int0_1285, %int9223372036854775807_1286, %int1_1287 : !torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %1952, [%773], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int2_1288 = torch.constant.int 2 | |
%1953 = torch.aten.unsqueeze %1952, %int2_1288 : !torch.vtensor<[1,?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %1953, [%773], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int3_1289 = torch.constant.int 3 | |
%int0_1290 = torch.constant.int 0 | |
%int9223372036854775807_1291 = torch.constant.int 9223372036854775807 | |
%int1_1292 = torch.constant.int 1 | |
%1954 = torch.aten.slice.Tensor %1953, %int3_1289, %int0_1290, %int9223372036854775807_1291, %int1_1292 : !torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %1954, [%773], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int4_1293 = torch.constant.int 4 | |
%int1_1294 = torch.constant.int 1 | |
%int1_1295 = torch.constant.int 1 | |
%int1_1296 = torch.constant.int 1 | |
%1955 = torch.prim.ListConstruct %int4_1293, %int1_1294, %int1_1295, %int1_1296 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1956 = torch.aten.repeat %1954, %1955 : !torch.vtensor<[1,?,1,128],bf16>, !torch.list<int> -> !torch.vtensor<[4,?,1,128],bf16> | |
torch.bind_symbolic_shape %1956, [%773], affine_map<()[s0] -> (4, s0 * 32, 1, 128)> : !torch.vtensor<[4,?,1,128],bf16> | |
%int0_1297 = torch.constant.int 0 | |
%1957 = torch.aten.unsqueeze %1950, %int0_1297 : !torch.vtensor<[?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %1957, [%773], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int1_1298 = torch.constant.int 1 | |
%int0_1299 = torch.constant.int 0 | |
%int9223372036854775807_1300 = torch.constant.int 9223372036854775807 | |
%int1_1301 = torch.constant.int 1 | |
%1958 = torch.aten.slice.Tensor %1957, %int1_1298, %int0_1299, %int9223372036854775807_1300, %int1_1301 : !torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %1958, [%773], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int2_1302 = torch.constant.int 2 | |
%1959 = torch.aten.unsqueeze %1958, %int2_1302 : !torch.vtensor<[1,?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %1959, [%773], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int3_1303 = torch.constant.int 3 | |
%int0_1304 = torch.constant.int 0 | |
%int9223372036854775807_1305 = torch.constant.int 9223372036854775807 | |
%int1_1306 = torch.constant.int 1 | |
%1960 = torch.aten.slice.Tensor %1959, %int3_1303, %int0_1304, %int9223372036854775807_1305, %int1_1306 : !torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %1960, [%773], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int4_1307 = torch.constant.int 4 | |
%int1_1308 = torch.constant.int 1 | |
%int1_1309 = torch.constant.int 1 | |
%int1_1310 = torch.constant.int 1 | |
%1961 = torch.prim.ListConstruct %int4_1307, %int1_1308, %int1_1309, %int1_1310 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1962 = torch.aten.repeat %1960, %1961 : !torch.vtensor<[1,?,1,128],bf16>, !torch.list<int> -> !torch.vtensor<[4,?,1,128],bf16> | |
torch.bind_symbolic_shape %1962, [%773], affine_map<()[s0] -> (4, s0 * 32, 1, 128)> : !torch.vtensor<[4,?,1,128],bf16> | |
%1963 = torch.aten.mul.Tensor %1903, %1956 : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16> -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1963, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 128)> : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
%int3_1311 = torch.constant.int 3 | |
%int0_1312 = torch.constant.int 0 | |
%int64_1313 = torch.constant.int 64 | |
%int1_1314 = torch.constant.int 1 | |
%1964 = torch.aten.slice.Tensor %1903, %int3_1311, %int0_1312, %int64_1313, %int1_1314 : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1964, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 64)> : !torch.vtensor<[4,?,32,64],f8E4M3FNUZ> | |
%int3_1315 = torch.constant.int 3 | |
%int64_1316 = torch.constant.int 64 | |
%int9223372036854775807_1317 = torch.constant.int 9223372036854775807 | |
%int1_1318 = torch.constant.int 1 | |
%1965 = torch.aten.slice.Tensor %1903, %int3_1315, %int64_1316, %int9223372036854775807_1317, %int1_1318 : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1965, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 64)> : !torch.vtensor<[4,?,32,64],f8E4M3FNUZ> | |
%1966 = torch.aten.neg %1965 : !torch.vtensor<[4,?,32,64],f8E4M3FNUZ> -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1966, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 64)> : !torch.vtensor<[4,?,32,64],f8E4M3FNUZ> | |
%1967 = torch.prim.ListConstruct %1966, %1964 : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>) -> !torch.list<vtensor> | |
%int-1_1319 = torch.constant.int -1 | |
%1968 = torch.aten.cat %1967, %int-1_1319 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1968, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 128)> : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
%1969 = torch.aten.mul.Tensor %1968, %1962 : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16> -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1969, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 128)> : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
%int1_1320 = torch.constant.int 1 | |
%1970 = torch.aten.add.Tensor %1963, %1969, %int1_1320 : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1970, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 128)> : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
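// %1970 is the rotated query. The same cos/sin table construction is repeated below and the rotary embedding is applied to the key heads (%1905), yielding %2033.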
%int131072_1321 = torch.constant.int 131072 | |
%none_1322 = torch.constant.none | |
%none_1323 = torch.constant.none | |
%cpu_1324 = torch.constant.device "cpu" | |
%false_1325 = torch.constant.bool false | |
%1971 = torch.aten.arange %int131072_1321, %none_1322, %none_1323, %cpu_1324, %false_1325 : !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[131072],si64> | |
%int0_1326 = torch.constant.int 0 | |
%int128_1327 = torch.constant.int 128 | |
%int2_1328 = torch.constant.int 2 | |
%int4_1329 = torch.constant.int 4 | |
%none_1330 = torch.constant.none | |
%cpu_1331 = torch.constant.device "cpu" | |
%false_1332 = torch.constant.bool false | |
%1972 = torch.aten.arange.start_step %int0_1326, %int128_1327, %int2_1328, %int4_1329, %none_1330, %cpu_1331, %false_1332 : !torch.int, !torch.int, !torch.int, !torch.int, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[64],si64> | |
%int6_1333 = torch.constant.int 6 | |
%1973 = torch.prims.convert_element_type %1972, %int6_1333 : !torch.vtensor<[64],si64>, !torch.int -> !torch.vtensor<[64],f32> | |
%int128_1334 = torch.constant.int 128 | |
%1974 = torch.aten.div.Scalar %1973, %int128_1334 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%float5.000000e05_1335 = torch.constant.float 5.000000e+05 | |
%1975 = torch.aten.pow.Scalar %float5.000000e05_1335, %1974 : !torch.float, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%1976 = torch.aten.reciprocal %1975 : !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%float1.000000e00_1336 = torch.constant.float 1.000000e+00 | |
%1977 = torch.aten.mul.Scalar %1976, %float1.000000e00_1336 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],f32> | |
%1978 = torch.aten.reciprocal %1977 : !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%float6.283190e00_1337 = torch.constant.float 6.2831853071795862 | |
%1979 = torch.aten.mul.Scalar %1978, %float6.283190e00_1337 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],f32> | |
%float8.192000e03_1338 = torch.constant.float 8.192000e+03 | |
%1980 = torch.aten.gt.Scalar %1979, %float8.192000e03_1338 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],i1> | |
%int8_1339 = torch.constant.int 8 | |
%1981 = torch.aten.div.Scalar %1977, %int8_1339 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%1982 = torch.aten.where.self %1980, %1981, %1977 : !torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%1983 = torch.aten.reciprocal %1979 : !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%int8192_1340 = torch.constant.int 8192 | |
%1984 = torch.aten.mul.Scalar %1983, %int8192_1340 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%int1_1341 = torch.constant.int 1 | |
%int1_1342 = torch.constant.int 1 | |
%1985 = torch.aten.sub.Scalar %1984, %int1_1341, %int1_1342 : !torch.vtensor<[64],f32>, !torch.int, !torch.int -> !torch.vtensor<[64],f32> | |
%int3_1343 = torch.constant.int 3 | |
%1986 = torch.aten.div.Scalar %1985, %int3_1343 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%int1_1344 = torch.constant.int 1 | |
%int1_1345 = torch.constant.int 1 | |
%1987 = torch.aten.rsub.Scalar %1986, %int1_1344, %int1_1345 : !torch.vtensor<[64],f32>, !torch.int, !torch.int -> !torch.vtensor<[64],f32> | |
%1988 = torch.aten.mul.Tensor %1987, %1982 : !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%int8_1346 = torch.constant.int 8 | |
%1989 = torch.aten.div.Scalar %1988, %int8_1346 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%1990 = torch.aten.mul.Tensor %1986, %1982 : !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%int1_1347 = torch.constant.int 1 | |
%1991 = torch.aten.add.Tensor %1989, %1990, %int1_1347 : !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%float2.048000e03_1348 = torch.constant.float 2.048000e+03 | |
%1992 = torch.aten.lt.Scalar %1979, %float2.048000e03_1348 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],i1> | |
%1993 = torch.aten.bitwise_not %1992 : !torch.vtensor<[64],i1> -> !torch.vtensor<[64],i1> | |
%float8.192000e03_1349 = torch.constant.float 8.192000e+03 | |
%1994 = torch.aten.gt.Scalar %1979, %float8.192000e03_1349 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],i1> | |
%1995 = torch.aten.bitwise_not %1994 : !torch.vtensor<[64],i1> -> !torch.vtensor<[64],i1> | |
%1996 = torch.aten.mul.Tensor %1993, %1995 : !torch.vtensor<[64],i1>, !torch.vtensor<[64],i1> -> !torch.vtensor<[64],i1> | |
%1997 = torch.aten.where.self %1996, %1991, %1982 : !torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%1998 = torch.prim.ListConstruct %1997, %1997 : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.list<vtensor> | |
%int-1_1350 = torch.constant.int -1 | |
%1999 = torch.aten.cat %1998, %int-1_1350 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[128],f32> | |
%int6_1351 = torch.constant.int 6 | |
%2000 = torch.prims.convert_element_type %1999, %int6_1351 : !torch.vtensor<[128],f32>, !torch.int -> !torch.vtensor<[128],f32> | |
%int1_1352 = torch.constant.int 1 | |
%2001 = torch.aten.unsqueeze %1971, %int1_1352 : !torch.vtensor<[131072],si64>, !torch.int -> !torch.vtensor<[131072,1],si64> | |
%int6_1353 = torch.constant.int 6 | |
%2002 = torch.prims.convert_element_type %2001, %int6_1353 : !torch.vtensor<[131072,1],si64>, !torch.int -> !torch.vtensor<[131072,1],f32> | |
%int0_1354 = torch.constant.int 0 | |
%2003 = torch.aten.unsqueeze %2000, %int0_1354 : !torch.vtensor<[128],f32>, !torch.int -> !torch.vtensor<[1,128],f32> | |
%int6_1355 = torch.constant.int 6 | |
%2004 = torch.prims.convert_element_type %2003, %int6_1355 : !torch.vtensor<[1,128],f32>, !torch.int -> !torch.vtensor<[1,128],f32> | |
%2005 = torch.aten.mul.Tensor %2002, %2004 : !torch.vtensor<[131072,1],f32>, !torch.vtensor<[1,128],f32> -> !torch.vtensor<[131072,128],f32> | |
%2006 = torch.aten.cos %2005 : !torch.vtensor<[131072,128],f32> -> !torch.vtensor<[131072,128],f32> | |
%int15_1356 = torch.constant.int 15 | |
%2007 = torch.prims.convert_element_type %2006, %int15_1356 : !torch.vtensor<[131072,128],f32>, !torch.int -> !torch.vtensor<[131072,128],bf16> | |
%2008 = torch.aten.sin %2005 : !torch.vtensor<[131072,128],f32> -> !torch.vtensor<[131072,128],f32> | |
%int15_1357 = torch.constant.int 15 | |
%2009 = torch.prims.convert_element_type %2008, %int15_1357 : !torch.vtensor<[131072,128],f32>, !torch.int -> !torch.vtensor<[131072,128],bf16> | |
%int0_1358 = torch.constant.int 0 | |
%int0_1359 = torch.constant.int 0 | |
%int1_1360 = torch.constant.int 1 | |
%2010 = torch.aten.slice.Tensor %2007, %int0_1358, %int0_1359, %777, %int1_1360 : !torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %2010, [%773], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int1_1361 = torch.constant.int 1 | |
%int0_1362 = torch.constant.int 0 | |
%int9223372036854775807_1363 = torch.constant.int 9223372036854775807 | |
%int1_1364 = torch.constant.int 1 | |
%2011 = torch.aten.slice.Tensor %2010, %int1_1361, %int0_1362, %int9223372036854775807_1363, %int1_1364 : !torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %2011, [%773], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int0_1365 = torch.constant.int 0 | |
%int0_1366 = torch.constant.int 0 | |
%int1_1367 = torch.constant.int 1 | |
%2012 = torch.aten.slice.Tensor %2009, %int0_1365, %int0_1366, %777, %int1_1367 : !torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %2012, [%773], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int1_1368 = torch.constant.int 1 | |
%int0_1369 = torch.constant.int 0 | |
%int9223372036854775807_1370 = torch.constant.int 9223372036854775807 | |
%int1_1371 = torch.constant.int 1 | |
%2013 = torch.aten.slice.Tensor %2012, %int1_1368, %int0_1369, %int9223372036854775807_1370, %int1_1371 : !torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %2013, [%773], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int0_1372 = torch.constant.int 0 | |
%2014 = torch.aten.unsqueeze %2011, %int0_1372 : !torch.vtensor<[?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %2014, [%773], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int1_1373 = torch.constant.int 1 | |
%int0_1374 = torch.constant.int 0 | |
%int9223372036854775807_1375 = torch.constant.int 9223372036854775807 | |
%int1_1376 = torch.constant.int 1 | |
%2015 = torch.aten.slice.Tensor %2014, %int1_1373, %int0_1374, %int9223372036854775807_1375, %int1_1376 : !torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %2015, [%773], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int2_1377 = torch.constant.int 2 | |
%2016 = torch.aten.unsqueeze %2015, %int2_1377 : !torch.vtensor<[1,?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %2016, [%773], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int3_1378 = torch.constant.int 3 | |
%int0_1379 = torch.constant.int 0 | |
%int9223372036854775807_1380 = torch.constant.int 9223372036854775807 | |
%int1_1381 = torch.constant.int 1 | |
%2017 = torch.aten.slice.Tensor %2016, %int3_1378, %int0_1379, %int9223372036854775807_1380, %int1_1381 : !torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %2017, [%773], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int4_1382 = torch.constant.int 4 | |
%int1_1383 = torch.constant.int 1 | |
%int1_1384 = torch.constant.int 1 | |
%int1_1385 = torch.constant.int 1 | |
%2018 = torch.prim.ListConstruct %int4_1382, %int1_1383, %int1_1384, %int1_1385 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2019 = torch.aten.repeat %2017, %2018 : !torch.vtensor<[1,?,1,128],bf16>, !torch.list<int> -> !torch.vtensor<[4,?,1,128],bf16> | |
torch.bind_symbolic_shape %2019, [%773], affine_map<()[s0] -> (4, s0 * 32, 1, 128)> : !torch.vtensor<[4,?,1,128],bf16> | |
%int0_1386 = torch.constant.int 0 | |
%2020 = torch.aten.unsqueeze %2013, %int0_1386 : !torch.vtensor<[?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %2020, [%773], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int1_1387 = torch.constant.int 1 | |
%int0_1388 = torch.constant.int 0 | |
%int9223372036854775807_1389 = torch.constant.int 9223372036854775807 | |
%int1_1390 = torch.constant.int 1 | |
%2021 = torch.aten.slice.Tensor %2020, %int1_1387, %int0_1388, %int9223372036854775807_1389, %int1_1390 : !torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %2021, [%773], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int2_1391 = torch.constant.int 2 | |
%2022 = torch.aten.unsqueeze %2021, %int2_1391 : !torch.vtensor<[1,?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %2022, [%773], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int3_1392 = torch.constant.int 3 | |
%int0_1393 = torch.constant.int 0 | |
%int9223372036854775807_1394 = torch.constant.int 9223372036854775807 | |
%int1_1395 = torch.constant.int 1 | |
%2023 = torch.aten.slice.Tensor %2022, %int3_1392, %int0_1393, %int9223372036854775807_1394, %int1_1395 : !torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %2023, [%773], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int4_1396 = torch.constant.int 4 | |
%int1_1397 = torch.constant.int 1 | |
%int1_1398 = torch.constant.int 1 | |
%int1_1399 = torch.constant.int 1 | |
%2024 = torch.prim.ListConstruct %int4_1396, %int1_1397, %int1_1398, %int1_1399 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2025 = torch.aten.repeat %2023, %2024 : !torch.vtensor<[1,?,1,128],bf16>, !torch.list<int> -> !torch.vtensor<[4,?,1,128],bf16> | |
torch.bind_symbolic_shape %2025, [%773], affine_map<()[s0] -> (4, s0 * 32, 1, 128)> : !torch.vtensor<[4,?,1,128],bf16> | |
%2026 = torch.aten.mul.Tensor %1905, %2019 : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16> -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2026, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 128)> : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
%int3_1400 = torch.constant.int 3 | |
%int0_1401 = torch.constant.int 0 | |
%int64_1402 = torch.constant.int 64 | |
%int1_1403 = torch.constant.int 1 | |
%2027 = torch.aten.slice.Tensor %1905, %int3_1400, %int0_1401, %int64_1402, %int1_1403 : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2027, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 64)> : !torch.vtensor<[4,?,8,64],f8E4M3FNUZ> | |
%int3_1404 = torch.constant.int 3 | |
%int64_1405 = torch.constant.int 64 | |
%int9223372036854775807_1406 = torch.constant.int 9223372036854775807 | |
%int1_1407 = torch.constant.int 1 | |
%2028 = torch.aten.slice.Tensor %1905, %int3_1404, %int64_1405, %int9223372036854775807_1406, %int1_1407 : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2028, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 64)> : !torch.vtensor<[4,?,8,64],f8E4M3FNUZ> | |
%2029 = torch.aten.neg %2028 : !torch.vtensor<[4,?,8,64],f8E4M3FNUZ> -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2029, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 64)> : !torch.vtensor<[4,?,8,64],f8E4M3FNUZ> | |
%2030 = torch.prim.ListConstruct %2029, %2027 : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>) -> !torch.list<vtensor> | |
%int-1_1408 = torch.constant.int -1 | |
%2031 = torch.aten.cat %2030, %int-1_1408 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2031, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 128)> : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
%2032 = torch.aten.mul.Tensor %2031, %2025 : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16> -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2032, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 128)> : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
%int1_1409 = torch.constant.int 1 | |
%2033 = torch.aten.add.Tensor %2026, %2032, %int1_1409 : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2033, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 128)> : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
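// Paged KV-cache update for the rotated keys: the page ids (%arg2) are scaled by the 64-entry page stride and offset by a constant that appears to select this block's K slot; the new keys are flattened to [?,32,8,128], bitcast f8E4M3FNUZ<->si8 around the index_put, and scattered into the cache %1740 viewed as [?,32,2,32,8,128].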
%int64_1410 = torch.constant.int 64 | |
%2034 = torch.aten.mul.Scalar %arg2, %int64_1410 : !torch.vtensor<[4,?],si64>, !torch.int -> !torch.vtensor<[4,?],si64> | |
torch.bind_symbolic_shape %2034, [%773], affine_map<()[s0] -> (4, s0)> : !torch.vtensor<[4,?],si64> | |
%int6_1411 = torch.constant.int 6 | |
%int1_1412 = torch.constant.int 1 | |
%2035 = torch.aten.add.Scalar %2034, %int6_1411, %int1_1412 : !torch.vtensor<[4,?],si64>, !torch.int, !torch.int -> !torch.vtensor<[4,?],si64> | |
torch.bind_symbolic_shape %2035, [%773], affine_map<()[s0] -> (4, s0)> : !torch.vtensor<[4,?],si64> | |
%int4_1413 = torch.constant.int 4 | |
%int32_1414 = torch.constant.int 32 | |
%int8_1415 = torch.constant.int 8 | |
%int128_1416 = torch.constant.int 128 | |
%2036 = torch.prim.ListConstruct %int4_1413, %775, %int32_1414, %int8_1415, %int128_1416 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2037 = torch.aten.view %2033, %2036 : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2037, [%773], affine_map<()[s0] -> (4, s0, 32, 8, 128)> : !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ> | |
%int32_1417 = torch.constant.int 32 | |
%int8_1418 = torch.constant.int 8 | |
%int128_1419 = torch.constant.int 128 | |
%2038 = torch.prim.ListConstruct %997, %int32_1417, %int8_1418, %int128_1419 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2039 = torch.aten.view %2037, %2038 : !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2039, [%773], affine_map<()[s0] -> (s0 * 4, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%2040 = torch.prim.ListConstruct %997 : (!torch.int) -> !torch.list<int> | |
%2041 = torch.aten.view %2035, %2040 : !torch.vtensor<[4,?],si64>, !torch.list<int> -> !torch.vtensor<[?],si64> | |
torch.bind_symbolic_shape %2041, [%773], affine_map<()[s0] -> (s0 * 4)> : !torch.vtensor<[?],si64> | |
%int26_1420 = torch.constant.int 26 | |
%2042 = torch.prims.convert_element_type %2039, %int26_1420 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2042, [%773], affine_map<()[s0] -> (s0 * 4, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%int1_1421 = torch.constant.int 1 | |
%2043 = torch.aten.view.dtype %2042, %int1_1421 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %2043, [%773], affine_map<()[s0] -> (s0 * 4, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%2044 = torch.aten.detach %2043 : !torch.vtensor<[?,32,8,128],si8> -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %2044, [%773], affine_map<()[s0] -> (s0 * 4, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%2045 = torch.aten.detach %2044 : !torch.vtensor<[?,32,8,128],si8> -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %2045, [%773], affine_map<()[s0] -> (s0 * 4, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%int32_1422 = torch.constant.int 32 | |
%int2_1423 = torch.constant.int 2 | |
%int32_1424 = torch.constant.int 32 | |
%int8_1425 = torch.constant.int 8 | |
%int128_1426 = torch.constant.int 128 | |
%2046 = torch.prim.ListConstruct %776, %int32_1422, %int2_1423, %int32_1424, %int8_1425, %int128_1426 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2047 = torch.aten.view %1740, %2046 : !torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2047, [%774], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
%int32_1427 = torch.constant.int 32 | |
%int8_1428 = torch.constant.int 8 | |
%int128_1429 = torch.constant.int 128 | |
%2048 = torch.prim.ListConstruct %990, %int32_1427, %int8_1428, %int128_1429 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2049 = torch.aten.view %2047, %2048 : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2049, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%int1_1430 = torch.constant.int 1 | |
%2050 = torch.aten.view.dtype %2049, %int1_1430 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %2050, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%2051 = torch.aten.detach %2050 : !torch.vtensor<[?,32,8,128],si8> -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %2051, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%2052 = torch.aten.detach %2051 : !torch.vtensor<[?,32,8,128],si8> -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %2052, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%2053 = torch.prim.ListConstruct %2041 : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>> | |
%false_1431 = torch.constant.bool false | |
%2054 = torch.aten.index_put %2052, %2053, %2045, %false_1431 : !torch.vtensor<[?,32,8,128],si8>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],si8>, !torch.bool -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %2054, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%int26_1432 = torch.constant.int 26 | |
%2055 = torch.aten.view.dtype %2054, %int26_1432 : !torch.vtensor<[?,32,8,128],si8>, !torch.int -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2055, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%2056 = torch.aten.detach %2055 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2056, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%2057 = torch.aten.detach %2056 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2057, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%int32_1433 = torch.constant.int 32 | |
%int2_1434 = torch.constant.int 2 | |
%int32_1435 = torch.constant.int 32 | |
%int8_1436 = torch.constant.int 8 | |
%int128_1437 = torch.constant.int 128 | |
%2058 = torch.prim.ListConstruct %776, %int32_1433, %int2_1434, %int32_1435, %int8_1436, %int128_1437 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2059 = torch.aten.view %2057, %2058 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2059, [%774], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
%int2097152_1438 = torch.constant.int 2097152 | |
%2060 = torch.prim.ListConstruct %776, %int2097152_1438 : (!torch.int, !torch.int) -> !torch.list<int> | |
%2061 = torch.aten.view %2059, %2060 : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,2097152],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2061, [%774], affine_map<()[s0] -> (s0, 2097152)> : !torch.vtensor<[?,2097152],f8E4M3FNUZ> | |
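// The value heads take the same path: the slot index is bumped by 1 (apparently the V slot of the same page entry), the states are bitcast to si8, scattered with index_put, and the cache is reshaped back to its flat [?,2097152] layout.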
%int4_1439 = torch.constant.int 4 | |
%int32_1440 = torch.constant.int 32 | |
%int8_1441 = torch.constant.int 8 | |
%int128_1442 = torch.constant.int 128 | |
%2062 = torch.prim.ListConstruct %int4_1439, %775, %int32_1440, %int8_1441, %int128_1442 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2063 = torch.aten.view %1907, %2062 : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2063, [%773], affine_map<()[s0] -> (4, s0, 32, 8, 128)> : !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ> | |
%int32_1443 = torch.constant.int 32 | |
%int8_1444 = torch.constant.int 8 | |
%int128_1445 = torch.constant.int 128 | |
%2064 = torch.prim.ListConstruct %997, %int32_1443, %int8_1444, %int128_1445 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2065 = torch.aten.view %2063, %2064 : !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2065, [%773], affine_map<()[s0] -> (s0 * 4, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%int1_1446 = torch.constant.int 1 | |
%int1_1447 = torch.constant.int 1 | |
%2066 = torch.aten.add.Scalar %2035, %int1_1446, %int1_1447 : !torch.vtensor<[4,?],si64>, !torch.int, !torch.int -> !torch.vtensor<[4,?],si64> | |
torch.bind_symbolic_shape %2066, [%773], affine_map<()[s0] -> (4, s0)> : !torch.vtensor<[4,?],si64> | |
%2067 = torch.prim.ListConstruct %997 : (!torch.int) -> !torch.list<int> | |
%2068 = torch.aten.view %2066, %2067 : !torch.vtensor<[4,?],si64>, !torch.list<int> -> !torch.vtensor<[?],si64> | |
torch.bind_symbolic_shape %2068, [%773], affine_map<()[s0] -> (s0 * 4)> : !torch.vtensor<[?],si64> | |
%int26_1448 = torch.constant.int 26 | |
%2069 = torch.prims.convert_element_type %2065, %int26_1448 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2069, [%773], affine_map<()[s0] -> (s0 * 4, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%int1_1449 = torch.constant.int 1 | |
%2070 = torch.aten.view.dtype %2069, %int1_1449 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %2070, [%773], affine_map<()[s0] -> (s0 * 4, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%2071 = torch.aten.detach %2070 : !torch.vtensor<[?,32,8,128],si8> -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %2071, [%773], affine_map<()[s0] -> (s0 * 4, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%2072 = torch.aten.detach %2071 : !torch.vtensor<[?,32,8,128],si8> -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %2072, [%773], affine_map<()[s0] -> (s0 * 4, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%int32_1450 = torch.constant.int 32 | |
%int2_1451 = torch.constant.int 2 | |
%int32_1452 = torch.constant.int 32 | |
%int8_1453 = torch.constant.int 8 | |
%int128_1454 = torch.constant.int 128 | |
%2073 = torch.prim.ListConstruct %776, %int32_1450, %int2_1451, %int32_1452, %int8_1453, %int128_1454 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2074 = torch.aten.view %2061, %2073 : !torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2074, [%774], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
%int32_1455 = torch.constant.int 32 | |
%int8_1456 = torch.constant.int 8 | |
%int128_1457 = torch.constant.int 128 | |
%2075 = torch.prim.ListConstruct %990, %int32_1455, %int8_1456, %int128_1457 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2076 = torch.aten.view %2074, %2075 : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2076, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%int1_1458 = torch.constant.int 1 | |
%2077 = torch.aten.view.dtype %2076, %int1_1458 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %2077, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%2078 = torch.aten.detach %2077 : !torch.vtensor<[?,32,8,128],si8> -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %2078, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%2079 = torch.aten.detach %2078 : !torch.vtensor<[?,32,8,128],si8> -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %2079, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%2080 = torch.prim.ListConstruct %2068 : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>> | |
%false_1459 = torch.constant.bool false | |
%2081 = torch.aten.index_put %2079, %2080, %2072, %false_1459 : !torch.vtensor<[?,32,8,128],si8>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],si8>, !torch.bool -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %2081, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%int26_1460 = torch.constant.int 26 | |
%2082 = torch.aten.view.dtype %2081, %int26_1460 : !torch.vtensor<[?,32,8,128],si8>, !torch.int -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2082, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%2083 = torch.aten.detach %2082 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2083, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%2084 = torch.aten.detach %2083 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2084, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%int32_1461 = torch.constant.int 32 | |
%int2_1462 = torch.constant.int 2 | |
%int32_1463 = torch.constant.int 32 | |
%int8_1464 = torch.constant.int 8 | |
%int128_1465 = torch.constant.int 128 | |
%2085 = torch.prim.ListConstruct %776, %int32_1461, %int2_1462, %int32_1463, %int8_1464, %int128_1465 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2086 = torch.aten.view %2084, %2085 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2086, [%774], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
%int2097152_1466 = torch.constant.int 2097152 | |
%2087 = torch.prim.ListConstruct %776, %int2097152_1466 : (!torch.int, !torch.int) -> !torch.list<int> | |
%2088 = torch.aten.view %2086, %2087 : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,2097152],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2088, [%774], affine_map<()[s0] -> (s0, 2097152)> : !torch.vtensor<[?,2097152],f8E4M3FNUZ> | |
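// Grouped-query expansion: the 8 KV heads are unsqueezed, expanded by a factor of 4 and reshaped to [4,?,32,128] so keys and values match the 32 query heads.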
%int-2_1467 = torch.constant.int -2 | |
%2089 = torch.aten.unsqueeze %2033, %int-2_1467 : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2089, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 1, 128)> : !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ> | |
%int4_1468 = torch.constant.int 4 | |
%int8_1469 = torch.constant.int 8 | |
%int4_1470 = torch.constant.int 4 | |
%int128_1471 = torch.constant.int 128 | |
%2090 = torch.prim.ListConstruct %int4_1468, %777, %int8_1469, %int4_1470, %int128_1471 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_1472 = torch.constant.bool false | |
%2091 = torch.aten.expand %2089, %2090, %false_1472 : !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2091, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 4, 128)> : !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ> | |
%int0_1473 = torch.constant.int 0 | |
%2092 = torch.aten.clone %2091, %int0_1473 : !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2092, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 4, 128)> : !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ> | |
%int4_1474 = torch.constant.int 4 | |
%int32_1475 = torch.constant.int 32 | |
%int128_1476 = torch.constant.int 128 | |
%2093 = torch.prim.ListConstruct %int4_1474, %777, %int32_1475, %int128_1476 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2094 = torch.aten._unsafe_view %2092, %2093 : !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2094, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 128)> : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
%int-2_1477 = torch.constant.int -2 | |
%2095 = torch.aten.unsqueeze %1907, %int-2_1477 : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2095, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 1, 128)> : !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ> | |
%int4_1478 = torch.constant.int 4 | |
%int8_1479 = torch.constant.int 8 | |
%int4_1480 = torch.constant.int 4 | |
%int128_1481 = torch.constant.int 128 | |
%2096 = torch.prim.ListConstruct %int4_1478, %777, %int8_1479, %int4_1480, %int128_1481 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_1482 = torch.constant.bool false | |
%2097 = torch.aten.expand %2095, %2096, %false_1482 : !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2097, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 4, 128)> : !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ> | |
%int0_1483 = torch.constant.int 0 | |
%2098 = torch.aten.clone %2097, %int0_1483 : !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2098, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 4, 128)> : !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ> | |
%int4_1484 = torch.constant.int 4 | |
%int32_1485 = torch.constant.int 32 | |
%int128_1486 = torch.constant.int 128 | |
%2099 = torch.prim.ListConstruct %int4_1484, %777, %int32_1485, %int128_1486 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2100 = torch.aten._unsafe_view %2098, %2099 : !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2100, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 128)> : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
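// Transpose Q, K and V from [4,?,32,128] to [4,32,?,128] ahead of the attention computation.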
%int1_1487 = torch.constant.int 1 | |
%int2_1488 = torch.constant.int 2 | |
%2101 = torch.aten.transpose.int %1970, %int1_1487, %int2_1488 : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2101, [%773], affine_map<()[s0] -> (4, 32, s0 * 32, 128)> : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
%int1_1489 = torch.constant.int 1 | |
%int2_1490 = torch.constant.int 2 | |
%2102 = torch.aten.transpose.int %2094, %int1_1489, %int2_1490 : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2102, [%773], affine_map<()[s0] -> (4, 32, s0 * 32, 128)> : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
%int1_1491 = torch.constant.int 1 | |
%int2_1492 = torch.constant.int 2 | |
%2103 = torch.aten.transpose.int %2100, %int1_1491, %int2_1492 : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2103, [%773], affine_map<()[s0] -> (4, 32, s0 * 32, 128)> : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
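// Torch dtype constant 26 corresponds to f8E4M3FNUZ here; the three converts below keep the operands in fp8 (result types are unchanged) ahead of the fused attention call.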
%int26_1493 = torch.constant.int 26 | |
%2104 = torch.prims.convert_element_type %2101, %int26_1493 : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2104, [%773], affine_map<()[s0] -> (4, 32, s0 * 32, 128)> : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
%int26_1494 = torch.constant.int 26 | |
%2105 = torch.prims.convert_element_type %2102, %int26_1494 : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2105, [%773], affine_map<()[s0] -> (4, 32, s0 * 32, 128)> : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
%int26_1495 = torch.constant.int 26 | |
%2106 = torch.prims.convert_element_type %2103, %int26_1495 : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2106, [%773], affine_map<()[s0] -> (4, 32, s0 * 32, 128)> : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
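// The [4, 1, seq, seq] f32 tensor %803 is cast to fp8 and reduced to a single [seq, seq] slice (index 0 on the two leading dims); it is used as the mask operand of the attention kernel below.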
%int26_1496 = torch.constant.int 26 | |
%2107 = torch.prims.convert_element_type %803, %int26_1496 : !torch.vtensor<[4,1,?,?],f32>, !torch.int -> !torch.vtensor<[4,1,?,?],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2107, [%773], affine_map<()[s0] -> (4, 1, s0 * 32, s0 * 32)> : !torch.vtensor<[4,1,?,?],f8E4M3FNUZ> | |
%int0_1497 = torch.constant.int 0 | |
%int0_1498 = torch.constant.int 0 | |
%2108 = torch.aten.select.int %2107, %int0_1497, %int0_1498 : !torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[1,?,?],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2108, [%773], affine_map<()[s0] -> (1, s0 * 32, s0 * 32)> : !torch.vtensor<[1,?,?],f8E4M3FNUZ> | |
%int0_1499 = torch.constant.int 0 | |
%int0_1500 = torch.constant.int 0 | |
%2109 = torch.aten.select.int %2108, %int0_1499, %int0_1500 : !torch.vtensor<[1,?,?],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[?,?],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2109, [%773], affine_map<()[s0] -> (s0 * 32, s0 * 32)> : !torch.vtensor<[?,?],f8E4M3FNUZ> | |
%int0_1501 = torch.constant.int 0 | |
%int0_1502 = torch.constant.int 0 | |
%int9223372036854775807_1503 = torch.constant.int 9223372036854775807 | |
%int1_1504 = torch.constant.int 1 | |
%2110 = torch.aten.slice.Tensor %2109, %int0_1501, %int0_1502, %int9223372036854775807_1503, %int1_1504 : !torch.vtensor<[?,?],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,?],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2110, [%773], affine_map<()[s0] -> (s0 * 32, s0 * 32)> : !torch.vtensor<[?,?],f8E4M3FNUZ> | |
%int1_1505 = torch.constant.int 1 | |
%int0_1506 = torch.constant.int 0 | |
%int9223372036854775807_1507 = torch.constant.int 9223372036854775807 | |
%int1_1508 = torch.constant.int 1 | |
%2111 = torch.aten.slice.Tensor %2110, %int1_1505, %int0_1506, %int9223372036854775807_1507, %int1_1508 : !torch.vtensor<[?,?],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,?],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2111, [%773], affine_map<()[s0] -> (s0 * 32, s0 * 32)> : !torch.vtensor<[?,?],f8E4M3FNUZ> | |
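// %83 is a scalar f32; the clone/detach chain simply materializes it as a constant. It is passed to the attention kernel in the scale position, and Q/K/V, the scale, and the mask are bridged to builtin tensors for the external call.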
%none_1509 = torch.constant.none | |
%2112 = torch.aten.clone %83, %none_1509 : !torch.vtensor<[],f32>, !torch.none -> !torch.vtensor<[],f32> | |
%2113 = torch.aten.detach %2112 : !torch.vtensor<[],f32> -> !torch.vtensor<[],f32> | |
%2114 = torch.aten.detach %2113 : !torch.vtensor<[],f32> -> !torch.vtensor<[],f32> | |
%2115 = torch.aten.detach %2114 : !torch.vtensor<[],f32> -> !torch.vtensor<[],f32> | |
%2116 = torch_c.to_builtin_tensor %2104 : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> -> tensor<4x32x?x128xf8E4M3FNUZ> | |
%2117 = torch_c.to_builtin_tensor %2105 : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> -> tensor<4x32x?x128xf8E4M3FNUZ> | |
%2118 = torch_c.to_builtin_tensor %2106 : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> -> tensor<4x32x?x128xf8E4M3FNUZ> | |
%2119 = torch_c.to_builtin_tensor %2111 : !torch.vtensor<[?,?],f8E4M3FNUZ> -> tensor<?x?xf8E4M3FNUZ> | |
%2120 = torch_c.to_builtin_tensor %2115 : !torch.vtensor<[],f32> -> tensor<f32> | |
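// Fused masked flash-attention microkernel: (Q, K, V, scale, mask) in fp8/f32 -> [4, 32, seq, 128] f32 output.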
%2121 = util.call @sharktank_masked_flash_attention_4_32_128_128_f8E4M3FNUZ_f8E4M3FNUZ_f32_f32(%2116, %2117, %2118, %2120, %2119) : (tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<f32>, tensor<?x?xf8E4M3FNUZ>) -> tensor<4x32x?x128xf32> | |
%2122 = torch_c.from_builtin_tensor %2121 : tensor<4x32x?x128xf32> -> !torch.vtensor<[4,32,?,128],f32> | |
torch.bind_symbolic_shape %2122, [%773], affine_map<()[s0] -> (4, 32, s0 * 32, 128)> : !torch.vtensor<[4,32,?,128],f32> | |
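// Transpose the attention output back to [4, seq, 32, 128] and flatten the heads into the 4096 model dimension.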
%int1_1510 = torch.constant.int 1 | |
%int2_1511 = torch.constant.int 2 | |
%2123 = torch.aten.transpose.int %2122, %int1_1510, %int2_1511 : !torch.vtensor<[4,32,?,128],f32>, !torch.int, !torch.int -> !torch.vtensor<[4,?,32,128],f32> | |
torch.bind_symbolic_shape %2123, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 128)> : !torch.vtensor<[4,?,32,128],f32> | |
%int0_1512 = torch.constant.int 0 | |
%2124 = torch.aten.clone %2123, %int0_1512 : !torch.vtensor<[4,?,32,128],f32>, !torch.int -> !torch.vtensor<[4,?,32,128],f32> | |
torch.bind_symbolic_shape %2124, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 128)> : !torch.vtensor<[4,?,32,128],f32> | |
%int4_1513 = torch.constant.int 4 | |
%int4096_1514 = torch.constant.int 4096 | |
%2125 = torch.prim.ListConstruct %int4_1513, %777, %int4096_1514 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2126 = torch.aten._unsafe_view %2124, %2125 : !torch.vtensor<[4,?,32,128],f32>, !torch.list<int> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2126, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
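// Re-quantize for the output projection: divide by the per-tensor input scale %84, clamp to [-240, 240] (the finite range of f8E4M3FNUZ), and cast to fp8.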
%2127 = torch.aten.div.Tensor %2126, %84 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2127, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%float-2.400000e02_1515 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_1516 = torch.constant.float 2.400000e+02 | |
%2128 = torch.aten.clamp %2127, %float-2.400000e02_1515, %float2.400000e02_1516 : !torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2128, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int26_1517 = torch.constant.int 26 | |
%2129 = torch.prims.convert_element_type %2128, %int26_1517 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2129, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
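// Output projection: the [4096, 4096] fp8 weight %85 is broadcast to the batch of 4 and applied with the transpose-B batch-matmul kernel, accumulating in f32.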
%int0_1518 = torch.constant.int 0 | |
%2130 = torch.aten.unsqueeze %85, %int0_1518 : !torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ> | |
%int4_1519 = torch.constant.int 4 | |
%int4096_1520 = torch.constant.int 4096 | |
%int4096_1521 = torch.constant.int 4096 | |
%2131 = torch.prim.ListConstruct %int4_1519, %int4096_1520, %int4096_1521 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_1522 = torch.constant.bool false | |
%2132 = torch.aten.expand %2130, %2131, %false_1522 : !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ> | |
%2133 = torch_c.to_builtin_tensor %2129 : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> -> tensor<4x?x4096xf8E4M3FNUZ> | |
%2134 = torch_c.to_builtin_tensor %2132 : !torch.vtensor<[4,4096,4096],f8E4M3FNUZ> -> tensor<4x4096x4096xf8E4M3FNUZ> | |
%2135 = util.call @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ_f32(%2133, %2134) : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x?x4096xf32> | |
%2136 = torch_c.from_builtin_tensor %2135 : tensor<4x?x4096xf32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2136, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
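// Dequantize by the product of the two per-tensor scales (%84 * %86), then add the residual %1852 carried in from the block input.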
%2137 = torch.prim.ListConstruct : () -> !torch.list<int> | |
%2138 = torch.aten.permute %86, %2137 : !torch.vtensor<[],f32>, !torch.list<int> -> !torch.vtensor<[],f32> | |
%2139 = torch.aten.mul.Tensor %84, %2138 : !torch.vtensor<[],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[],f32> | |
%int6_1523 = torch.constant.int 6 | |
%2140 = torch.prims.convert_element_type %2136, %int6_1523 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2140, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%2141 = torch.aten.mul.Tensor %2140, %2139 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2141, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int1_1524 = torch.constant.int 1 | |
%2142 = torch.aten.add.Tensor %1852, %2141, %int1_1524 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2142, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
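// RMSNorm: square, mean over the last dim, add eps 1e-05, rsqrt, scale by the bf16 weight %87.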
%int6_1525 = torch.constant.int 6 | |
%2143 = torch.prims.convert_element_type %2142, %int6_1525 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2143, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int2_1526 = torch.constant.int 2 | |
%2144 = torch.aten.pow.Tensor_Scalar %2143, %int2_1526 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2144, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int-1_1527 = torch.constant.int -1 | |
%2145 = torch.prim.ListConstruct %int-1_1527 : (!torch.int) -> !torch.list<int> | |
%true_1528 = torch.constant.bool true | |
%none_1529 = torch.constant.none | |
%2146 = torch.aten.mean.dim %2144, %2145, %true_1528, %none_1529 : !torch.vtensor<[4,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[4,?,1],f32> | |
torch.bind_symbolic_shape %2146, [%773], affine_map<()[s0] -> (4, s0 * 32, 1)> : !torch.vtensor<[4,?,1],f32> | |
%float1.000000e-05_1530 = torch.constant.float 1.000000e-05 | |
%int1_1531 = torch.constant.int 1 | |
%2147 = torch.aten.add.Scalar %2146, %float1.000000e-05_1530, %int1_1531 : !torch.vtensor<[4,?,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[4,?,1],f32> | |
torch.bind_symbolic_shape %2147, [%773], affine_map<()[s0] -> (4, s0 * 32, 1)> : !torch.vtensor<[4,?,1],f32> | |
%2148 = torch.aten.rsqrt %2147 : !torch.vtensor<[4,?,1],f32> -> !torch.vtensor<[4,?,1],f32> | |
torch.bind_symbolic_shape %2148, [%773], affine_map<()[s0] -> (4, s0 * 32, 1)> : !torch.vtensor<[4,?,1],f32> | |
%2149 = torch.aten.mul.Tensor %2143, %2148 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[4,?,1],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2149, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int6_1532 = torch.constant.int 6 | |
%2150 = torch.prims.convert_element_type %2149, %int6_1532 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2150, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%2151 = torch.aten.mul.Tensor %87, %2150 : !torch.vtensor<[4096],bf16>, !torch.vtensor<[4,?,4096],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2151, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int6_1533 = torch.constant.int 6 | |
%2152 = torch.prims.convert_element_type %2151, %int6_1533 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2152, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%2153 = torch.aten.div.Tensor %2152, %88 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2153, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%float-2.400000e02_1534 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_1535 = torch.constant.float 2.400000e+02 | |
%2154 = torch.aten.clamp %2153, %float-2.400000e02_1534, %float2.400000e02_1535 : !torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2154, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int26_1536 = torch.constant.int 26 | |
%2155 = torch.prims.convert_element_type %2154, %int26_1536 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2155, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
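// The fp8 activation just produced feeds the gate projection below: batch-matmul against the [14336, 4096] weight %89, dequantize with %90 * %88, then SiLU.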
%int0_1537 = torch.constant.int 0 | |
%2156 = torch.aten.unsqueeze %89, %int0_1537 : !torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ> | |
%int4_1538 = torch.constant.int 4 | |
%int14336_1539 = torch.constant.int 14336 | |
%int4096_1540 = torch.constant.int 4096 | |
%2157 = torch.prim.ListConstruct %int4_1538, %int14336_1539, %int4096_1540 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_1541 = torch.constant.bool false | |
%2158 = torch.aten.expand %2156, %2157, %false_1541 : !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ> | |
%2159 = torch_c.to_builtin_tensor %2155 : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> -> tensor<4x?x4096xf8E4M3FNUZ> | |
%2160 = torch_c.to_builtin_tensor %2158 : !torch.vtensor<[4,14336,4096],f8E4M3FNUZ> -> tensor<4x14336x4096xf8E4M3FNUZ> | |
%2161 = util.call @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ_f32(%2159, %2160) : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x?x14336xf32> | |
%2162 = torch_c.from_builtin_tensor %2161 : tensor<4x?x14336xf32> -> !torch.vtensor<[4,?,14336],f32> | |
torch.bind_symbolic_shape %2162, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f32> | |
%2163 = torch.prim.ListConstruct : () -> !torch.list<int> | |
%2164 = torch.aten.permute %90, %2163 : !torch.vtensor<[],f32>, !torch.list<int> -> !torch.vtensor<[],f32> | |
%2165 = torch.aten.mul.Tensor %88, %2164 : !torch.vtensor<[],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[],f32> | |
%int6_1542 = torch.constant.int 6 | |
%2166 = torch.prims.convert_element_type %2162, %int6_1542 : !torch.vtensor<[4,?,14336],f32>, !torch.int -> !torch.vtensor<[4,?,14336],f32> | |
torch.bind_symbolic_shape %2166, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f32> | |
%2167 = torch.aten.mul.Tensor %2166, %2165 : !torch.vtensor<[4,?,14336],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,14336],f32> | |
torch.bind_symbolic_shape %2167, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f32> | |
%2168 = torch.aten.silu %2167 : !torch.vtensor<[4,?,14336],f32> -> !torch.vtensor<[4,?,14336],f32> | |
torch.bind_symbolic_shape %2168, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f32> | |
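// Up projection: the same quantize -> fp8 matmul -> dequantize pattern against the second [14336, 4096] weight %92; its result is multiplied elementwise with the SiLU'd gate (SwiGLU).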
%2169 = torch.aten.div.Tensor %2152, %91 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2169, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%float-2.400000e02_1543 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_1544 = torch.constant.float 2.400000e+02 | |
%2170 = torch.aten.clamp %2169, %float-2.400000e02_1543, %float2.400000e02_1544 : !torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2170, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int26_1545 = torch.constant.int 26 | |
%2171 = torch.prims.convert_element_type %2170, %int26_1545 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2171, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
%int0_1546 = torch.constant.int 0 | |
%2172 = torch.aten.unsqueeze %92, %int0_1546 : !torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ> | |
%int4_1547 = torch.constant.int 4 | |
%int14336_1548 = torch.constant.int 14336 | |
%int4096_1549 = torch.constant.int 4096 | |
%2173 = torch.prim.ListConstruct %int4_1547, %int14336_1548, %int4096_1549 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_1550 = torch.constant.bool false | |
%2174 = torch.aten.expand %2172, %2173, %false_1550 : !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ> | |
%2175 = torch_c.to_builtin_tensor %2171 : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> -> tensor<4x?x4096xf8E4M3FNUZ> | |
%2176 = torch_c.to_builtin_tensor %2174 : !torch.vtensor<[4,14336,4096],f8E4M3FNUZ> -> tensor<4x14336x4096xf8E4M3FNUZ> | |
%2177 = util.call @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ_f32(%2175, %2176) : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x?x14336xf32> | |
%2178 = torch_c.from_builtin_tensor %2177 : tensor<4x?x14336xf32> -> !torch.vtensor<[4,?,14336],f32> | |
torch.bind_symbolic_shape %2178, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f32> | |
%2179 = torch.prim.ListConstruct : () -> !torch.list<int> | |
%2180 = torch.aten.permute %93, %2179 : !torch.vtensor<[],f32>, !torch.list<int> -> !torch.vtensor<[],f32> | |
%2181 = torch.aten.mul.Tensor %91, %2180 : !torch.vtensor<[],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[],f32> | |
%int6_1551 = torch.constant.int 6 | |
%2182 = torch.prims.convert_element_type %2178, %int6_1551 : !torch.vtensor<[4,?,14336],f32>, !torch.int -> !torch.vtensor<[4,?,14336],f32> | |
torch.bind_symbolic_shape %2182, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f32> | |
%2183 = torch.aten.mul.Tensor %2182, %2181 : !torch.vtensor<[4,?,14336],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,14336],f32> | |
torch.bind_symbolic_shape %2183, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f32> | |
%2184 = torch.aten.mul.Tensor %2168, %2183 : !torch.vtensor<[4,?,14336],f32>, !torch.vtensor<[4,?,14336],f32> -> !torch.vtensor<[4,?,14336],f32> | |
torch.bind_symbolic_shape %2184, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f32> | |
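// Down projection: quantize the 14336-wide product (scale %94), matmul against the [4096, 14336] weight %95, and dequantize with %96 * %94.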
%2185 = torch.aten.div.Tensor %2184, %94 : !torch.vtensor<[4,?,14336],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,14336],f32> | |
torch.bind_symbolic_shape %2185, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f32> | |
%float-2.400000e02_1552 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_1553 = torch.constant.float 2.400000e+02 | |
%2186 = torch.aten.clamp %2185, %float-2.400000e02_1552, %float2.400000e02_1553 : !torch.vtensor<[4,?,14336],f32>, !torch.float, !torch.float -> !torch.vtensor<[4,?,14336],f32> | |
torch.bind_symbolic_shape %2186, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f32> | |
%int26_1554 = torch.constant.int 26 | |
%2187 = torch.prims.convert_element_type %2186, %int26_1554 : !torch.vtensor<[4,?,14336],f32>, !torch.int -> !torch.vtensor<[4,?,14336],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2187, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f8E4M3FNUZ> | |
%int0_1555 = torch.constant.int 0 | |
%2188 = torch.aten.unsqueeze %95, %int0_1555 : !torch.vtensor<[4096,14336],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,4096,14336],f8E4M3FNUZ> | |
%int4_1556 = torch.constant.int 4 | |
%int4096_1557 = torch.constant.int 4096 | |
%int14336_1558 = torch.constant.int 14336 | |
%2189 = torch.prim.ListConstruct %int4_1556, %int4096_1557, %int14336_1558 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_1559 = torch.constant.bool false | |
%2190 = torch.aten.expand %2188, %2189, %false_1559 : !torch.vtensor<[1,4096,14336],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[4,4096,14336],f8E4M3FNUZ> | |
%2191 = torch_c.to_builtin_tensor %2187 : !torch.vtensor<[4,?,14336],f8E4M3FNUZ> -> tensor<4x?x14336xf8E4M3FNUZ> | |
%2192 = torch_c.to_builtin_tensor %2190 : !torch.vtensor<[4,4096,14336],f8E4M3FNUZ> -> tensor<4x4096x14336xf8E4M3FNUZ> | |
%2193 = util.call @sharktank_batch_matmul_transpose_b_L4xDx14336xf8E4M3FNUZ_R4x4096x14336xf8E4M3FNUZ_f32(%2191, %2192) : (tensor<4x?x14336xf8E4M3FNUZ>, tensor<4x4096x14336xf8E4M3FNUZ>) -> tensor<4x?x4096xf32> | |
%2194 = torch_c.from_builtin_tensor %2193 : tensor<4x?x4096xf32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2194, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%2195 = torch.prim.ListConstruct : () -> !torch.list<int> | |
%2196 = torch.aten.permute %96, %2195 : !torch.vtensor<[],f32>, !torch.list<int> -> !torch.vtensor<[],f32> | |
%2197 = torch.aten.mul.Tensor %94, %2196 : !torch.vtensor<[],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[],f32> | |
%int6_1560 = torch.constant.int 6 | |
%2198 = torch.prims.convert_element_type %2194, %int6_1560 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2198, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%2199 = torch.aten.mul.Tensor %2198, %2197 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2199, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int1_1561 = torch.constant.int 1 | |
%2200 = torch.aten.add.Tensor %2142, %2199, %int1_1561 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2200, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
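// The residual add %2142 + %2199 closes this transformer block. The IR below starts the next block: input RMSNorm followed by the Q/K/V projections.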
%int6_1562 = torch.constant.int 6 | |
%2201 = torch.prims.convert_element_type %2200, %int6_1562 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2201, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int2_1563 = torch.constant.int 2 | |
%2202 = torch.aten.pow.Tensor_Scalar %2201, %int2_1563 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2202, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int-1_1564 = torch.constant.int -1 | |
%2203 = torch.prim.ListConstruct %int-1_1564 : (!torch.int) -> !torch.list<int> | |
%true_1565 = torch.constant.bool true | |
%none_1566 = torch.constant.none | |
%2204 = torch.aten.mean.dim %2202, %2203, %true_1565, %none_1566 : !torch.vtensor<[4,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[4,?,1],f32> | |
torch.bind_symbolic_shape %2204, [%773], affine_map<()[s0] -> (4, s0 * 32, 1)> : !torch.vtensor<[4,?,1],f32> | |
%float1.000000e-05_1567 = torch.constant.float 1.000000e-05 | |
%int1_1568 = torch.constant.int 1 | |
%2205 = torch.aten.add.Scalar %2204, %float1.000000e-05_1567, %int1_1568 : !torch.vtensor<[4,?,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[4,?,1],f32> | |
torch.bind_symbolic_shape %2205, [%773], affine_map<()[s0] -> (4, s0 * 32, 1)> : !torch.vtensor<[4,?,1],f32> | |
%2206 = torch.aten.rsqrt %2205 : !torch.vtensor<[4,?,1],f32> -> !torch.vtensor<[4,?,1],f32> | |
torch.bind_symbolic_shape %2206, [%773], affine_map<()[s0] -> (4, s0 * 32, 1)> : !torch.vtensor<[4,?,1],f32> | |
%2207 = torch.aten.mul.Tensor %2201, %2206 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[4,?,1],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2207, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int6_1569 = torch.constant.int 6 | |
%2208 = torch.prims.convert_element_type %2207, %int6_1569 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2208, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%2209 = torch.aten.mul.Tensor %97, %2208 : !torch.vtensor<[4096],bf16>, !torch.vtensor<[4,?,4096],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2209, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int6_1570 = torch.constant.int 6 | |
%2210 = torch.prims.convert_element_type %2209, %int6_1570 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2210, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
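// Q/K/V projections: for each, the normalized activation is scaled, clamped, cast to fp8, multiplied against the fp8 weight ([4096, 4096] for Q, [1024, 4096] for K and V), and the f32 result is rescaled and re-cast to fp8.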
%2211 = torch.aten.div.Tensor %2210, %98 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2211, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%float-2.400000e02_1571 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_1572 = torch.constant.float 2.400000e+02 | |
%2212 = torch.aten.clamp %2211, %float-2.400000e02_1571, %float2.400000e02_1572 : !torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2212, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int26_1573 = torch.constant.int 26 | |
%2213 = torch.prims.convert_element_type %2212, %int26_1573 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2213, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
%int0_1574 = torch.constant.int 0 | |
%2214 = torch.aten.unsqueeze %99, %int0_1574 : !torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ> | |
%int4_1575 = torch.constant.int 4 | |
%int4096_1576 = torch.constant.int 4096 | |
%int4096_1577 = torch.constant.int 4096 | |
%2215 = torch.prim.ListConstruct %int4_1575, %int4096_1576, %int4096_1577 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_1578 = torch.constant.bool false | |
%2216 = torch.aten.expand %2214, %2215, %false_1578 : !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ> | |
%2217 = torch_c.to_builtin_tensor %2213 : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> -> tensor<4x?x4096xf8E4M3FNUZ> | |
%2218 = torch_c.to_builtin_tensor %2216 : !torch.vtensor<[4,4096,4096],f8E4M3FNUZ> -> tensor<4x4096x4096xf8E4M3FNUZ> | |
%2219 = util.call @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ_f32(%2217, %2218) : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x?x4096xf32> | |
%2220 = torch_c.from_builtin_tensor %2219 : tensor<4x?x4096xf32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2220, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%2221 = torch.aten.div.Tensor %2220, %100 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2221, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%float-2.400000e02_1579 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_1580 = torch.constant.float 2.400000e+02 | |
%2222 = torch.aten.clamp %2221, %float-2.400000e02_1579, %float2.400000e02_1580 : !torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2222, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int26_1581 = torch.constant.int 26 | |
%2223 = torch.prims.convert_element_type %2222, %int26_1581 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2223, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
%2224 = torch.aten.div.Tensor %2210, %101 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2224, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%float-2.400000e02_1582 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_1583 = torch.constant.float 2.400000e+02 | |
%2225 = torch.aten.clamp %2224, %float-2.400000e02_1582, %float2.400000e02_1583 : !torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2225, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int26_1584 = torch.constant.int 26 | |
%2226 = torch.prims.convert_element_type %2225, %int26_1584 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2226, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
%int0_1585 = torch.constant.int 0 | |
%2227 = torch.aten.unsqueeze %102, %int0_1585 : !torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ> | |
%int4_1586 = torch.constant.int 4 | |
%int1024_1587 = torch.constant.int 1024 | |
%int4096_1588 = torch.constant.int 4096 | |
%2228 = torch.prim.ListConstruct %int4_1586, %int1024_1587, %int4096_1588 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_1589 = torch.constant.bool false | |
%2229 = torch.aten.expand %2227, %2228, %false_1589 : !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ> | |
%2230 = torch_c.to_builtin_tensor %2226 : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> -> tensor<4x?x4096xf8E4M3FNUZ> | |
%2231 = torch_c.to_builtin_tensor %2229 : !torch.vtensor<[4,1024,4096],f8E4M3FNUZ> -> tensor<4x1024x4096xf8E4M3FNUZ> | |
%2232 = util.call @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ_f32(%2230, %2231) : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x?x1024xf32> | |
%2233 = torch_c.from_builtin_tensor %2232 : tensor<4x?x1024xf32> -> !torch.vtensor<[4,?,1024],f32> | |
torch.bind_symbolic_shape %2233, [%773], affine_map<()[s0] -> (4, s0 * 32, 1024)> : !torch.vtensor<[4,?,1024],f32> | |
%2234 = torch.aten.div.Tensor %2233, %103 : !torch.vtensor<[4,?,1024],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,1024],f32> | |
torch.bind_symbolic_shape %2234, [%773], affine_map<()[s0] -> (4, s0 * 32, 1024)> : !torch.vtensor<[4,?,1024],f32> | |
%float-2.400000e02_1590 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_1591 = torch.constant.float 2.400000e+02 | |
%2235 = torch.aten.clamp %2234, %float-2.400000e02_1590, %float2.400000e02_1591 : !torch.vtensor<[4,?,1024],f32>, !torch.float, !torch.float -> !torch.vtensor<[4,?,1024],f32> | |
torch.bind_symbolic_shape %2235, [%773], affine_map<()[s0] -> (4, s0 * 32, 1024)> : !torch.vtensor<[4,?,1024],f32> | |
%int26_1592 = torch.constant.int 26 | |
%2236 = torch.prims.convert_element_type %2235, %int26_1592 : !torch.vtensor<[4,?,1024],f32>, !torch.int -> !torch.vtensor<[4,?,1024],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2236, [%773], affine_map<()[s0] -> (4, s0 * 32, 1024)> : !torch.vtensor<[4,?,1024],f8E4M3FNUZ> | |
%2237 = torch.aten.div.Tensor %2210, %104 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2237, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%float-2.400000e02_1593 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_1594 = torch.constant.float 2.400000e+02 | |
%2238 = torch.aten.clamp %2237, %float-2.400000e02_1593, %float2.400000e02_1594 : !torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2238, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int26_1595 = torch.constant.int 26 | |
%2239 = torch.prims.convert_element_type %2238, %int26_1595 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2239, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
%int0_1596 = torch.constant.int 0 | |
%2240 = torch.aten.unsqueeze %105, %int0_1596 : !torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ> | |
%int4_1597 = torch.constant.int 4 | |
%int1024_1598 = torch.constant.int 1024 | |
%int4096_1599 = torch.constant.int 4096 | |
%2241 = torch.prim.ListConstruct %int4_1597, %int1024_1598, %int4096_1599 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_1600 = torch.constant.bool false | |
%2242 = torch.aten.expand %2240, %2241, %false_1600 : !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ> | |
%2243 = torch_c.to_builtin_tensor %2239 : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> -> tensor<4x?x4096xf8E4M3FNUZ> | |
%2244 = torch_c.to_builtin_tensor %2242 : !torch.vtensor<[4,1024,4096],f8E4M3FNUZ> -> tensor<4x1024x4096xf8E4M3FNUZ> | |
%2245 = util.call @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ_f32(%2243, %2244) : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x?x1024xf32> | |
%2246 = torch_c.from_builtin_tensor %2245 : tensor<4x?x1024xf32> -> !torch.vtensor<[4,?,1024],f32> | |
torch.bind_symbolic_shape %2246, [%773], affine_map<()[s0] -> (4, s0 * 32, 1024)> : !torch.vtensor<[4,?,1024],f32> | |
%2247 = torch.aten.div.Tensor %2246, %106 : !torch.vtensor<[4,?,1024],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,1024],f32> | |
torch.bind_symbolic_shape %2247, [%773], affine_map<()[s0] -> (4, s0 * 32, 1024)> : !torch.vtensor<[4,?,1024],f32> | |
%float-2.400000e02_1601 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_1602 = torch.constant.float 2.400000e+02 | |
%2248 = torch.aten.clamp %2247, %float-2.400000e02_1601, %float2.400000e02_1602 : !torch.vtensor<[4,?,1024],f32>, !torch.float, !torch.float -> !torch.vtensor<[4,?,1024],f32> | |
torch.bind_symbolic_shape %2248, [%773], affine_map<()[s0] -> (4, s0 * 32, 1024)> : !torch.vtensor<[4,?,1024],f32> | |
%int26_1603 = torch.constant.int 26 | |
%2249 = torch.prims.convert_element_type %2248, %int26_1603 : !torch.vtensor<[4,?,1024],f32>, !torch.int -> !torch.vtensor<[4,?,1024],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2249, [%773], affine_map<()[s0] -> (4, s0 * 32, 1024)> : !torch.vtensor<[4,?,1024],f8E4M3FNUZ> | |
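// Reshape the projections into heads: Q to [4, seq, 32, 128], K and V to [4, seq, 8, 128].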
%int4_1604 = torch.constant.int 4 | |
%int32_1605 = torch.constant.int 32 | |
%int128_1606 = torch.constant.int 128 | |
%2250 = torch.prim.ListConstruct %int4_1604, %777, %int32_1605, %int128_1606 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2251 = torch.aten.view %2223, %2250 : !torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2251, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 128)> : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
%int4_1607 = torch.constant.int 4 | |
%int8_1608 = torch.constant.int 8 | |
%int128_1609 = torch.constant.int 128 | |
%2252 = torch.prim.ListConstruct %int4_1607, %777, %int8_1608, %int128_1609 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2253 = torch.aten.view %2236, %2252 : !torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2253, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 128)> : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
%int4_1610 = torch.constant.int 4 | |
%int8_1611 = torch.constant.int 8 | |
%int128_1612 = torch.constant.int 128 | |
%2254 = torch.prim.ListConstruct %int4_1610, %777, %int8_1611, %int128_1612 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2255 = torch.aten.view %2249, %2254 : !torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2255, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 128)> : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
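// RoPE table construction: positions 0..131071 and 64 inverse frequencies with base 500000, rescaled for long context (wavelengths above 8192 are divided by 8, with a smooth blend between the 2048 and 8192 cutoffs; this appears to be Llama-3-style frequency scaling). The frequencies are duplicated to 128 lanes and expanded into cos/sin tables in bf16.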
%int131072_1613 = torch.constant.int 131072 | |
%none_1614 = torch.constant.none | |
%none_1615 = torch.constant.none | |
%cpu_1616 = torch.constant.device "cpu" | |
%false_1617 = torch.constant.bool false | |
%2256 = torch.aten.arange %int131072_1613, %none_1614, %none_1615, %cpu_1616, %false_1617 : !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[131072],si64> | |
%int0_1618 = torch.constant.int 0 | |
%int128_1619 = torch.constant.int 128 | |
%int2_1620 = torch.constant.int 2 | |
%int4_1621 = torch.constant.int 4 | |
%none_1622 = torch.constant.none | |
%cpu_1623 = torch.constant.device "cpu" | |
%false_1624 = torch.constant.bool false | |
%2257 = torch.aten.arange.start_step %int0_1618, %int128_1619, %int2_1620, %int4_1621, %none_1622, %cpu_1623, %false_1624 : !torch.int, !torch.int, !torch.int, !torch.int, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[64],si64> | |
%int6_1625 = torch.constant.int 6 | |
%2258 = torch.prims.convert_element_type %2257, %int6_1625 : !torch.vtensor<[64],si64>, !torch.int -> !torch.vtensor<[64],f32> | |
%int128_1626 = torch.constant.int 128 | |
%2259 = torch.aten.div.Scalar %2258, %int128_1626 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%float5.000000e05_1627 = torch.constant.float 5.000000e+05 | |
%2260 = torch.aten.pow.Scalar %float5.000000e05_1627, %2259 : !torch.float, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%2261 = torch.aten.reciprocal %2260 : !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%float1.000000e00_1628 = torch.constant.float 1.000000e+00 | |
%2262 = torch.aten.mul.Scalar %2261, %float1.000000e00_1628 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],f32> | |
%2263 = torch.aten.reciprocal %2262 : !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%float6.283190e00_1629 = torch.constant.float 6.2831853071795862 | |
%2264 = torch.aten.mul.Scalar %2263, %float6.283190e00_1629 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],f32> | |
%float8.192000e03_1630 = torch.constant.float 8.192000e+03 | |
%2265 = torch.aten.gt.Scalar %2264, %float8.192000e03_1630 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],i1> | |
%int8_1631 = torch.constant.int 8 | |
%2266 = torch.aten.div.Scalar %2262, %int8_1631 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%2267 = torch.aten.where.self %2265, %2266, %2262 : !torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%2268 = torch.aten.reciprocal %2264 : !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%int8192_1632 = torch.constant.int 8192 | |
%2269 = torch.aten.mul.Scalar %2268, %int8192_1632 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%int1_1633 = torch.constant.int 1 | |
%int1_1634 = torch.constant.int 1 | |
%2270 = torch.aten.sub.Scalar %2269, %int1_1633, %int1_1634 : !torch.vtensor<[64],f32>, !torch.int, !torch.int -> !torch.vtensor<[64],f32> | |
%int3_1635 = torch.constant.int 3 | |
%2271 = torch.aten.div.Scalar %2270, %int3_1635 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%int1_1636 = torch.constant.int 1 | |
%int1_1637 = torch.constant.int 1 | |
%2272 = torch.aten.rsub.Scalar %2271, %int1_1636, %int1_1637 : !torch.vtensor<[64],f32>, !torch.int, !torch.int -> !torch.vtensor<[64],f32> | |
%2273 = torch.aten.mul.Tensor %2272, %2267 : !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%int8_1638 = torch.constant.int 8 | |
%2274 = torch.aten.div.Scalar %2273, %int8_1638 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%2275 = torch.aten.mul.Tensor %2271, %2267 : !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%int1_1639 = torch.constant.int 1 | |
%2276 = torch.aten.add.Tensor %2274, %2275, %int1_1639 : !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%float2.048000e03_1640 = torch.constant.float 2.048000e+03 | |
%2277 = torch.aten.lt.Scalar %2264, %float2.048000e03_1640 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],i1> | |
%2278 = torch.aten.bitwise_not %2277 : !torch.vtensor<[64],i1> -> !torch.vtensor<[64],i1> | |
%float8.192000e03_1641 = torch.constant.float 8.192000e+03 | |
%2279 = torch.aten.gt.Scalar %2264, %float8.192000e03_1641 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],i1> | |
%2280 = torch.aten.bitwise_not %2279 : !torch.vtensor<[64],i1> -> !torch.vtensor<[64],i1> | |
%2281 = torch.aten.mul.Tensor %2278, %2280 : !torch.vtensor<[64],i1>, !torch.vtensor<[64],i1> -> !torch.vtensor<[64],i1> | |
%2282 = torch.aten.where.self %2281, %2276, %2267 : !torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%2283 = torch.prim.ListConstruct %2282, %2282 : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.list<vtensor> | |
%int-1_1642 = torch.constant.int -1 | |
%2284 = torch.aten.cat %2283, %int-1_1642 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[128],f32> | |
%int6_1643 = torch.constant.int 6 | |
%2285 = torch.prims.convert_element_type %2284, %int6_1643 : !torch.vtensor<[128],f32>, !torch.int -> !torch.vtensor<[128],f32> | |
%int1_1644 = torch.constant.int 1 | |
%2286 = torch.aten.unsqueeze %2256, %int1_1644 : !torch.vtensor<[131072],si64>, !torch.int -> !torch.vtensor<[131072,1],si64> | |
%int6_1645 = torch.constant.int 6 | |
%2287 = torch.prims.convert_element_type %2286, %int6_1645 : !torch.vtensor<[131072,1],si64>, !torch.int -> !torch.vtensor<[131072,1],f32> | |
%int0_1646 = torch.constant.int 0 | |
%2288 = torch.aten.unsqueeze %2285, %int0_1646 : !torch.vtensor<[128],f32>, !torch.int -> !torch.vtensor<[1,128],f32> | |
%int6_1647 = torch.constant.int 6 | |
%2289 = torch.prims.convert_element_type %2288, %int6_1647 : !torch.vtensor<[1,128],f32>, !torch.int -> !torch.vtensor<[1,128],f32> | |
%2290 = torch.aten.mul.Tensor %2287, %2289 : !torch.vtensor<[131072,1],f32>, !torch.vtensor<[1,128],f32> -> !torch.vtensor<[131072,128],f32> | |
%2291 = torch.aten.cos %2290 : !torch.vtensor<[131072,128],f32> -> !torch.vtensor<[131072,128],f32> | |
%int15_1648 = torch.constant.int 15 | |
%2292 = torch.prims.convert_element_type %2291, %int15_1648 : !torch.vtensor<[131072,128],f32>, !torch.int -> !torch.vtensor<[131072,128],bf16> | |
%2293 = torch.aten.sin %2290 : !torch.vtensor<[131072,128],f32> -> !torch.vtensor<[131072,128],f32> | |
%int15_1649 = torch.constant.int 15 | |
%2294 = torch.prims.convert_element_type %2293, %int15_1649 : !torch.vtensor<[131072,128],f32>, !torch.int -> !torch.vtensor<[131072,128],bf16> | |
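// Slice the cos/sin tables to the current sequence length (%777 rows) and broadcast them to the batch of 4 as [4, seq, 1, 128].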
%int0_1650 = torch.constant.int 0 | |
%int0_1651 = torch.constant.int 0 | |
%int1_1652 = torch.constant.int 1 | |
%2295 = torch.aten.slice.Tensor %2292, %int0_1650, %int0_1651, %777, %int1_1652 : !torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %2295, [%773], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int1_1653 = torch.constant.int 1 | |
%int0_1654 = torch.constant.int 0 | |
%int9223372036854775807_1655 = torch.constant.int 9223372036854775807 | |
%int1_1656 = torch.constant.int 1 | |
%2296 = torch.aten.slice.Tensor %2295, %int1_1653, %int0_1654, %int9223372036854775807_1655, %int1_1656 : !torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %2296, [%773], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int0_1657 = torch.constant.int 0 | |
%int0_1658 = torch.constant.int 0 | |
%int1_1659 = torch.constant.int 1 | |
%2297 = torch.aten.slice.Tensor %2294, %int0_1657, %int0_1658, %777, %int1_1659 : !torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %2297, [%773], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int1_1660 = torch.constant.int 1 | |
%int0_1661 = torch.constant.int 0 | |
%int9223372036854775807_1662 = torch.constant.int 9223372036854775807 | |
%int1_1663 = torch.constant.int 1 | |
%2298 = torch.aten.slice.Tensor %2297, %int1_1660, %int0_1661, %int9223372036854775807_1662, %int1_1663 : !torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %2298, [%773], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int0_1664 = torch.constant.int 0 | |
%2299 = torch.aten.unsqueeze %2296, %int0_1664 : !torch.vtensor<[?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %2299, [%773], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int1_1665 = torch.constant.int 1 | |
%int0_1666 = torch.constant.int 0 | |
%int9223372036854775807_1667 = torch.constant.int 9223372036854775807 | |
%int1_1668 = torch.constant.int 1 | |
%2300 = torch.aten.slice.Tensor %2299, %int1_1665, %int0_1666, %int9223372036854775807_1667, %int1_1668 : !torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %2300, [%773], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int2_1669 = torch.constant.int 2 | |
%2301 = torch.aten.unsqueeze %2300, %int2_1669 : !torch.vtensor<[1,?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %2301, [%773], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int3_1670 = torch.constant.int 3 | |
%int0_1671 = torch.constant.int 0 | |
%int9223372036854775807_1672 = torch.constant.int 9223372036854775807 | |
%int1_1673 = torch.constant.int 1 | |
%2302 = torch.aten.slice.Tensor %2301, %int3_1670, %int0_1671, %int9223372036854775807_1672, %int1_1673 : !torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %2302, [%773], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int4_1674 = torch.constant.int 4 | |
%int1_1675 = torch.constant.int 1 | |
%int1_1676 = torch.constant.int 1 | |
%int1_1677 = torch.constant.int 1 | |
%2303 = torch.prim.ListConstruct %int4_1674, %int1_1675, %int1_1676, %int1_1677 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2304 = torch.aten.repeat %2302, %2303 : !torch.vtensor<[1,?,1,128],bf16>, !torch.list<int> -> !torch.vtensor<[4,?,1,128],bf16> | |
torch.bind_symbolic_shape %2304, [%773], affine_map<()[s0] -> (4, s0 * 32, 1, 128)> : !torch.vtensor<[4,?,1,128],bf16> | |
%int0_1678 = torch.constant.int 0 | |
%2305 = torch.aten.unsqueeze %2298, %int0_1678 : !torch.vtensor<[?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %2305, [%773], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int1_1679 = torch.constant.int 1 | |
%int0_1680 = torch.constant.int 0 | |
%int9223372036854775807_1681 = torch.constant.int 9223372036854775807 | |
%int1_1682 = torch.constant.int 1 | |
%2306 = torch.aten.slice.Tensor %2305, %int1_1679, %int0_1680, %int9223372036854775807_1681, %int1_1682 : !torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %2306, [%773], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int2_1683 = torch.constant.int 2 | |
%2307 = torch.aten.unsqueeze %2306, %int2_1683 : !torch.vtensor<[1,?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %2307, [%773], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int3_1684 = torch.constant.int 3 | |
%int0_1685 = torch.constant.int 0 | |
%int9223372036854775807_1686 = torch.constant.int 9223372036854775807 | |
%int1_1687 = torch.constant.int 1 | |
%2308 = torch.aten.slice.Tensor %2307, %int3_1684, %int0_1685, %int9223372036854775807_1686, %int1_1687 : !torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %2308, [%773], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int4_1688 = torch.constant.int 4 | |
%int1_1689 = torch.constant.int 1 | |
%int1_1690 = torch.constant.int 1 | |
%int1_1691 = torch.constant.int 1 | |
%2309 = torch.prim.ListConstruct %int4_1688, %int1_1689, %int1_1690, %int1_1691 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2310 = torch.aten.repeat %2308, %2309 : !torch.vtensor<[1,?,1,128],bf16>, !torch.list<int> -> !torch.vtensor<[4,?,1,128],bf16> | |
torch.bind_symbolic_shape %2310, [%773], affine_map<()[s0] -> (4, s0 * 32, 1, 128)> : !torch.vtensor<[4,?,1,128],bf16> | |
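// Apply the rotary embedding to Q: q * cos + rotate_half(q) * sin, where rotate_half negates the upper 64 lanes and swaps the two halves.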
%2311 = torch.aten.mul.Tensor %2251, %2304 : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16> -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2311, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 128)> : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
%int3_1692 = torch.constant.int 3 | |
%int0_1693 = torch.constant.int 0 | |
%int64_1694 = torch.constant.int 64 | |
%int1_1695 = torch.constant.int 1 | |
%2312 = torch.aten.slice.Tensor %2251, %int3_1692, %int0_1693, %int64_1694, %int1_1695 : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2312, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 64)> : !torch.vtensor<[4,?,32,64],f8E4M3FNUZ> | |
%int3_1696 = torch.constant.int 3 | |
%int64_1697 = torch.constant.int 64 | |
%int9223372036854775807_1698 = torch.constant.int 9223372036854775807 | |
%int1_1699 = torch.constant.int 1 | |
%2313 = torch.aten.slice.Tensor %2251, %int3_1696, %int64_1697, %int9223372036854775807_1698, %int1_1699 : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2313, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 64)> : !torch.vtensor<[4,?,32,64],f8E4M3FNUZ> | |
%2314 = torch.aten.neg %2313 : !torch.vtensor<[4,?,32,64],f8E4M3FNUZ> -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2314, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 64)> : !torch.vtensor<[4,?,32,64],f8E4M3FNUZ> | |
%2315 = torch.prim.ListConstruct %2314, %2312 : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>) -> !torch.list<vtensor> | |
%int-1_1700 = torch.constant.int -1 | |
%2316 = torch.aten.cat %2315, %int-1_1700 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2316, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 128)> : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
%2317 = torch.aten.mul.Tensor %2316, %2310 : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16> -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2317, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 128)> : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
%int1_1701 = torch.constant.int 1 | |
%2318 = torch.aten.add.Tensor %2311, %2317, %int1_1701 : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2318, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 128)> : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
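// The same RoPE table computation is emitted again below; it appears to feed the key-side rotary embedding.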
%int131072_1702 = torch.constant.int 131072 | |
%none_1703 = torch.constant.none | |
%none_1704 = torch.constant.none | |
%cpu_1705 = torch.constant.device "cpu" | |
%false_1706 = torch.constant.bool false | |
%2319 = torch.aten.arange %int131072_1702, %none_1703, %none_1704, %cpu_1705, %false_1706 : !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[131072],si64> | |
%int0_1707 = torch.constant.int 0 | |
%int128_1708 = torch.constant.int 128 | |
%int2_1709 = torch.constant.int 2 | |
%int4_1710 = torch.constant.int 4 | |
%none_1711 = torch.constant.none | |
%cpu_1712 = torch.constant.device "cpu" | |
%false_1713 = torch.constant.bool false | |
%2320 = torch.aten.arange.start_step %int0_1707, %int128_1708, %int2_1709, %int4_1710, %none_1711, %cpu_1712, %false_1713 : !torch.int, !torch.int, !torch.int, !torch.int, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[64],si64> | |
%int6_1714 = torch.constant.int 6 | |
%2321 = torch.prims.convert_element_type %2320, %int6_1714 : !torch.vtensor<[64],si64>, !torch.int -> !torch.vtensor<[64],f32> | |
%int128_1715 = torch.constant.int 128 | |
%2322 = torch.aten.div.Scalar %2321, %int128_1715 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%float5.000000e05_1716 = torch.constant.float 5.000000e+05 | |
%2323 = torch.aten.pow.Scalar %float5.000000e05_1716, %2322 : !torch.float, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%2324 = torch.aten.reciprocal %2323 : !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%float1.000000e00_1717 = torch.constant.float 1.000000e+00 | |
%2325 = torch.aten.mul.Scalar %2324, %float1.000000e00_1717 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],f32> | |
%2326 = torch.aten.reciprocal %2325 : !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%float6.283190e00_1718 = torch.constant.float 6.2831853071795862 | |
%2327 = torch.aten.mul.Scalar %2326, %float6.283190e00_1718 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],f32> | |
%float8.192000e03_1719 = torch.constant.float 8.192000e+03 | |
%2328 = torch.aten.gt.Scalar %2327, %float8.192000e03_1719 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],i1> | |
%int8_1720 = torch.constant.int 8 | |
%2329 = torch.aten.div.Scalar %2325, %int8_1720 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%2330 = torch.aten.where.self %2328, %2329, %2325 : !torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%2331 = torch.aten.reciprocal %2327 : !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%int8192_1721 = torch.constant.int 8192 | |
%2332 = torch.aten.mul.Scalar %2331, %int8192_1721 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%int1_1722 = torch.constant.int 1 | |
%int1_1723 = torch.constant.int 1 | |
%2333 = torch.aten.sub.Scalar %2332, %int1_1722, %int1_1723 : !torch.vtensor<[64],f32>, !torch.int, !torch.int -> !torch.vtensor<[64],f32> | |
%int3_1724 = torch.constant.int 3 | |
%2334 = torch.aten.div.Scalar %2333, %int3_1724 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%int1_1725 = torch.constant.int 1 | |
%int1_1726 = torch.constant.int 1 | |
%2335 = torch.aten.rsub.Scalar %2334, %int1_1725, %int1_1726 : !torch.vtensor<[64],f32>, !torch.int, !torch.int -> !torch.vtensor<[64],f32> | |
%2336 = torch.aten.mul.Tensor %2335, %2330 : !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%int8_1727 = torch.constant.int 8 | |
%2337 = torch.aten.div.Scalar %2336, %int8_1727 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%2338 = torch.aten.mul.Tensor %2334, %2330 : !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%int1_1728 = torch.constant.int 1 | |
%2339 = torch.aten.add.Tensor %2337, %2338, %int1_1728 : !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%float2.048000e03_1729 = torch.constant.float 2.048000e+03 | |
%2340 = torch.aten.lt.Scalar %2327, %float2.048000e03_1729 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],i1> | |
%2341 = torch.aten.bitwise_not %2340 : !torch.vtensor<[64],i1> -> !torch.vtensor<[64],i1> | |
%float8.192000e03_1730 = torch.constant.float 8.192000e+03 | |
%2342 = torch.aten.gt.Scalar %2327, %float8.192000e03_1730 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],i1> | |
%2343 = torch.aten.bitwise_not %2342 : !torch.vtensor<[64],i1> -> !torch.vtensor<[64],i1> | |
%2344 = torch.aten.mul.Tensor %2341, %2343 : !torch.vtensor<[64],i1>, !torch.vtensor<[64],i1> -> !torch.vtensor<[64],i1> | |
%2345 = torch.aten.where.self %2344, %2339, %2330 : !torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%2346 = torch.prim.ListConstruct %2345, %2345 : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.list<vtensor> | |
%int-1_1731 = torch.constant.int -1 | |
%2347 = torch.aten.cat %2346, %int-1_1731 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[128],f32> | |
%int6_1732 = torch.constant.int 6 | |
%2348 = torch.prims.convert_element_type %2347, %int6_1732 : !torch.vtensor<[128],f32>, !torch.int -> !torch.vtensor<[128],f32> | |
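// Outer product of positions (%2350) and the duplicated frequency vector (%2352) yields the
// angle table %2353 of shape [131072, 128]; its cos and sin are cast to bf16 as the rotary tables.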
%int1_1733 = torch.constant.int 1 | |
%2349 = torch.aten.unsqueeze %2319, %int1_1733 : !torch.vtensor<[131072],si64>, !torch.int -> !torch.vtensor<[131072,1],si64> | |
%int6_1734 = torch.constant.int 6 | |
%2350 = torch.prims.convert_element_type %2349, %int6_1734 : !torch.vtensor<[131072,1],si64>, !torch.int -> !torch.vtensor<[131072,1],f32> | |
%int0_1735 = torch.constant.int 0 | |
%2351 = torch.aten.unsqueeze %2348, %int0_1735 : !torch.vtensor<[128],f32>, !torch.int -> !torch.vtensor<[1,128],f32> | |
%int6_1736 = torch.constant.int 6 | |
%2352 = torch.prims.convert_element_type %2351, %int6_1736 : !torch.vtensor<[1,128],f32>, !torch.int -> !torch.vtensor<[1,128],f32> | |
%2353 = torch.aten.mul.Tensor %2350, %2352 : !torch.vtensor<[131072,1],f32>, !torch.vtensor<[1,128],f32> -> !torch.vtensor<[131072,128],f32> | |
%2354 = torch.aten.cos %2353 : !torch.vtensor<[131072,128],f32> -> !torch.vtensor<[131072,128],f32> | |
%int15_1737 = torch.constant.int 15 | |
%2355 = torch.prims.convert_element_type %2354, %int15_1737 : !torch.vtensor<[131072,128],f32>, !torch.int -> !torch.vtensor<[131072,128],bf16> | |
%2356 = torch.aten.sin %2353 : !torch.vtensor<[131072,128],f32> -> !torch.vtensor<[131072,128],f32> | |
%int15_1738 = torch.constant.int 15 | |
%2357 = torch.prims.convert_element_type %2356, %int15_1738 : !torch.vtensor<[131072,128],f32>, !torch.int -> !torch.vtensor<[131072,128],bf16> | |
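// The cos/sin tables are sliced to the current sequence length (%777, i.e. s0 * 32 rows)
// and broadcast to the batch of 4 via the unsqueeze/slice/repeat sequence below.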
%int0_1739 = torch.constant.int 0 | |
%int0_1740 = torch.constant.int 0 | |
%int1_1741 = torch.constant.int 1 | |
%2358 = torch.aten.slice.Tensor %2355, %int0_1739, %int0_1740, %777, %int1_1741 : !torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %2358, [%773], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int1_1742 = torch.constant.int 1 | |
%int0_1743 = torch.constant.int 0 | |
%int9223372036854775807_1744 = torch.constant.int 9223372036854775807 | |
%int1_1745 = torch.constant.int 1 | |
%2359 = torch.aten.slice.Tensor %2358, %int1_1742, %int0_1743, %int9223372036854775807_1744, %int1_1745 : !torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %2359, [%773], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int0_1746 = torch.constant.int 0 | |
%int0_1747 = torch.constant.int 0 | |
%int1_1748 = torch.constant.int 1 | |
%2360 = torch.aten.slice.Tensor %2357, %int0_1746, %int0_1747, %777, %int1_1748 : !torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %2360, [%773], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int1_1749 = torch.constant.int 1 | |
%int0_1750 = torch.constant.int 0 | |
%int9223372036854775807_1751 = torch.constant.int 9223372036854775807 | |
%int1_1752 = torch.constant.int 1 | |
%2361 = torch.aten.slice.Tensor %2360, %int1_1749, %int0_1750, %int9223372036854775807_1751, %int1_1752 : !torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %2361, [%773], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int0_1753 = torch.constant.int 0 | |
%2362 = torch.aten.unsqueeze %2359, %int0_1753 : !torch.vtensor<[?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %2362, [%773], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int1_1754 = torch.constant.int 1 | |
%int0_1755 = torch.constant.int 0 | |
%int9223372036854775807_1756 = torch.constant.int 9223372036854775807 | |
%int1_1757 = torch.constant.int 1 | |
%2363 = torch.aten.slice.Tensor %2362, %int1_1754, %int0_1755, %int9223372036854775807_1756, %int1_1757 : !torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %2363, [%773], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int2_1758 = torch.constant.int 2 | |
%2364 = torch.aten.unsqueeze %2363, %int2_1758 : !torch.vtensor<[1,?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %2364, [%773], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int3_1759 = torch.constant.int 3 | |
%int0_1760 = torch.constant.int 0 | |
%int9223372036854775807_1761 = torch.constant.int 9223372036854775807 | |
%int1_1762 = torch.constant.int 1 | |
%2365 = torch.aten.slice.Tensor %2364, %int3_1759, %int0_1760, %int9223372036854775807_1761, %int1_1762 : !torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %2365, [%773], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int4_1763 = torch.constant.int 4 | |
%int1_1764 = torch.constant.int 1 | |
%int1_1765 = torch.constant.int 1 | |
%int1_1766 = torch.constant.int 1 | |
%2366 = torch.prim.ListConstruct %int4_1763, %int1_1764, %int1_1765, %int1_1766 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2367 = torch.aten.repeat %2365, %2366 : !torch.vtensor<[1,?,1,128],bf16>, !torch.list<int> -> !torch.vtensor<[4,?,1,128],bf16> | |
torch.bind_symbolic_shape %2367, [%773], affine_map<()[s0] -> (4, s0 * 32, 1, 128)> : !torch.vtensor<[4,?,1,128],bf16> | |
%int0_1767 = torch.constant.int 0 | |
%2368 = torch.aten.unsqueeze %2361, %int0_1767 : !torch.vtensor<[?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %2368, [%773], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int1_1768 = torch.constant.int 1 | |
%int0_1769 = torch.constant.int 0 | |
%int9223372036854775807_1770 = torch.constant.int 9223372036854775807 | |
%int1_1771 = torch.constant.int 1 | |
%2369 = torch.aten.slice.Tensor %2368, %int1_1768, %int0_1769, %int9223372036854775807_1770, %int1_1771 : !torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %2369, [%773], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int2_1772 = torch.constant.int 2 | |
%2370 = torch.aten.unsqueeze %2369, %int2_1772 : !torch.vtensor<[1,?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %2370, [%773], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int3_1773 = torch.constant.int 3 | |
%int0_1774 = torch.constant.int 0 | |
%int9223372036854775807_1775 = torch.constant.int 9223372036854775807 | |
%int1_1776 = torch.constant.int 1 | |
%2371 = torch.aten.slice.Tensor %2370, %int3_1773, %int0_1774, %int9223372036854775807_1775, %int1_1776 : !torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %2371, [%773], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int4_1777 = torch.constant.int 4 | |
%int1_1778 = torch.constant.int 1 | |
%int1_1779 = torch.constant.int 1 | |
%int1_1780 = torch.constant.int 1 | |
%2372 = torch.prim.ListConstruct %int4_1777, %int1_1778, %int1_1779, %int1_1780 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2373 = torch.aten.repeat %2371, %2372 : !torch.vtensor<[1,?,1,128],bf16>, !torch.list<int> -> !torch.vtensor<[4,?,1,128],bf16> | |
torch.bind_symbolic_shape %2373, [%773], affine_map<()[s0] -> (4, s0 * 32, 1, 128)> : !torch.vtensor<[4,?,1,128],bf16> | |
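// Rotary embedding applied to the 8-head K tensor %2253: k*cos plus rotate_half(k)*sin,
// where rotate_half is the slice/negate/concat sequence producing %2379.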
%2374 = torch.aten.mul.Tensor %2253, %2367 : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16> -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2374, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 128)> : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
%int3_1781 = torch.constant.int 3 | |
%int0_1782 = torch.constant.int 0 | |
%int64_1783 = torch.constant.int 64 | |
%int1_1784 = torch.constant.int 1 | |
%2375 = torch.aten.slice.Tensor %2253, %int3_1781, %int0_1782, %int64_1783, %int1_1784 : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2375, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 64)> : !torch.vtensor<[4,?,8,64],f8E4M3FNUZ> | |
%int3_1785 = torch.constant.int 3 | |
%int64_1786 = torch.constant.int 64 | |
%int9223372036854775807_1787 = torch.constant.int 9223372036854775807 | |
%int1_1788 = torch.constant.int 1 | |
%2376 = torch.aten.slice.Tensor %2253, %int3_1785, %int64_1786, %int9223372036854775807_1787, %int1_1788 : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2376, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 64)> : !torch.vtensor<[4,?,8,64],f8E4M3FNUZ> | |
%2377 = torch.aten.neg %2376 : !torch.vtensor<[4,?,8,64],f8E4M3FNUZ> -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2377, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 64)> : !torch.vtensor<[4,?,8,64],f8E4M3FNUZ> | |
%2378 = torch.prim.ListConstruct %2377, %2375 : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>) -> !torch.list<vtensor> | |
%int-1_1789 = torch.constant.int -1 | |
%2379 = torch.aten.cat %2378, %int-1_1789 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2379, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 128)> : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
%2380 = torch.aten.mul.Tensor %2379, %2373 : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16> -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2380, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 128)> : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
%int1_1790 = torch.constant.int 1 | |
%2381 = torch.aten.add.Tensor %2374, %2380, %int1_1790 : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2381, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 128)> : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
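// %2382/%2383 appear to compute linearized slot indices into the paged KV cache: page ids in
// %arg2 are scaled by 64 (likely 32 transformer blocks x 2 for K/V) and offset to this block's
// K slot; the V slot reuses the same indices plus 1 (%2414 further down).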
%int64_1791 = torch.constant.int 64 | |
%2382 = torch.aten.mul.Scalar %arg2, %int64_1791 : !torch.vtensor<[4,?],si64>, !torch.int -> !torch.vtensor<[4,?],si64> | |
torch.bind_symbolic_shape %2382, [%773], affine_map<()[s0] -> (4, s0)> : !torch.vtensor<[4,?],si64> | |
%int8_1792 = torch.constant.int 8 | |
%int1_1793 = torch.constant.int 1 | |
%2383 = torch.aten.add.Scalar %2382, %int8_1792, %int1_1793 : !torch.vtensor<[4,?],si64>, !torch.int, !torch.int -> !torch.vtensor<[4,?],si64> | |
torch.bind_symbolic_shape %2383, [%773], affine_map<()[s0] -> (4, s0)> : !torch.vtensor<[4,?],si64> | |
%int4_1794 = torch.constant.int 4 | |
%int32_1795 = torch.constant.int 32 | |
%int8_1796 = torch.constant.int 8 | |
%int128_1797 = torch.constant.int 128 | |
%2384 = torch.prim.ListConstruct %int4_1794, %775, %int32_1795, %int8_1796, %int128_1797 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2385 = torch.aten.view %2381, %2384 : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2385, [%773], affine_map<()[s0] -> (4, s0, 32, 8, 128)> : !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ> | |
%int32_1798 = torch.constant.int 32 | |
%int8_1799 = torch.constant.int 8 | |
%int128_1800 = torch.constant.int 128 | |
%2386 = torch.prim.ListConstruct %997, %int32_1798, %int8_1799, %int128_1800 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2387 = torch.aten.view %2385, %2386 : !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2387, [%773], affine_map<()[s0] -> (s0 * 4, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%2388 = torch.prim.ListConstruct %997 : (!torch.int) -> !torch.list<int> | |
%2389 = torch.aten.view %2383, %2388 : !torch.vtensor<[4,?],si64>, !torch.list<int> -> !torch.vtensor<[?],si64> | |
torch.bind_symbolic_shape %2389, [%773], affine_map<()[s0] -> (s0 * 4)> : !torch.vtensor<[?],si64> | |
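// The rotated K states are scattered into the shared cache buffer %2088. Both the values and
// the cache are bitcast (aten.view.dtype) from f8E4M3FNUZ to si8 so index_put can operate on
// an integer view, then bitcast back and reshaped to the flat [?, 2097152] cache layout.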
%int26_1801 = torch.constant.int 26 | |
%2390 = torch.prims.convert_element_type %2387, %int26_1801 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2390, [%773], affine_map<()[s0] -> (s0 * 4, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%int1_1802 = torch.constant.int 1 | |
%2391 = torch.aten.view.dtype %2390, %int1_1802 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %2391, [%773], affine_map<()[s0] -> (s0 * 4, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%2392 = torch.aten.detach %2391 : !torch.vtensor<[?,32,8,128],si8> -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %2392, [%773], affine_map<()[s0] -> (s0 * 4, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%2393 = torch.aten.detach %2392 : !torch.vtensor<[?,32,8,128],si8> -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %2393, [%773], affine_map<()[s0] -> (s0 * 4, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%int32_1803 = torch.constant.int 32 | |
%int2_1804 = torch.constant.int 2 | |
%int32_1805 = torch.constant.int 32 | |
%int8_1806 = torch.constant.int 8 | |
%int128_1807 = torch.constant.int 128 | |
%2394 = torch.prim.ListConstruct %776, %int32_1803, %int2_1804, %int32_1805, %int8_1806, %int128_1807 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2395 = torch.aten.view %2088, %2394 : !torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2395, [%774], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
%int32_1808 = torch.constant.int 32 | |
%int8_1809 = torch.constant.int 8 | |
%int128_1810 = torch.constant.int 128 | |
%2396 = torch.prim.ListConstruct %990, %int32_1808, %int8_1809, %int128_1810 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2397 = torch.aten.view %2395, %2396 : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2397, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%int1_1811 = torch.constant.int 1 | |
%2398 = torch.aten.view.dtype %2397, %int1_1811 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %2398, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%2399 = torch.aten.detach %2398 : !torch.vtensor<[?,32,8,128],si8> -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %2399, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%2400 = torch.aten.detach %2399 : !torch.vtensor<[?,32,8,128],si8> -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %2400, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%2401 = torch.prim.ListConstruct %2389 : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>> | |
%false_1812 = torch.constant.bool false | |
%2402 = torch.aten.index_put %2400, %2401, %2393, %false_1812 : !torch.vtensor<[?,32,8,128],si8>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],si8>, !torch.bool -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %2402, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%int26_1813 = torch.constant.int 26 | |
%2403 = torch.aten.view.dtype %2402, %int26_1813 : !torch.vtensor<[?,32,8,128],si8>, !torch.int -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2403, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%2404 = torch.aten.detach %2403 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2404, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%2405 = torch.aten.detach %2404 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2405, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%int32_1814 = torch.constant.int 32 | |
%int2_1815 = torch.constant.int 2 | |
%int32_1816 = torch.constant.int 32 | |
%int8_1817 = torch.constant.int 8 | |
%int128_1818 = torch.constant.int 128 | |
%2406 = torch.prim.ListConstruct %776, %int32_1814, %int2_1815, %int32_1816, %int8_1817, %int128_1818 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2407 = torch.aten.view %2405, %2406 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2407, [%774], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
%int2097152_1819 = torch.constant.int 2097152 | |
%2408 = torch.prim.ListConstruct %776, %int2097152_1819 : (!torch.int, !torch.int) -> !torch.list<int> | |
%2409 = torch.aten.view %2407, %2408 : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,2097152],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2409, [%774], affine_map<()[s0] -> (s0, 2097152)> : !torch.vtensor<[?,2097152],f8E4M3FNUZ> | |
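// The same scatter is repeated for the V states (%2255) at the neighbouring cache slot
// (indices %2414 = %2383 + 1), producing the updated flat cache %2436.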
%int4_1820 = torch.constant.int 4 | |
%int32_1821 = torch.constant.int 32 | |
%int8_1822 = torch.constant.int 8 | |
%int128_1823 = torch.constant.int 128 | |
%2410 = torch.prim.ListConstruct %int4_1820, %775, %int32_1821, %int8_1822, %int128_1823 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2411 = torch.aten.view %2255, %2410 : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2411, [%773], affine_map<()[s0] -> (4, s0, 32, 8, 128)> : !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ> | |
%int32_1824 = torch.constant.int 32 | |
%int8_1825 = torch.constant.int 8 | |
%int128_1826 = torch.constant.int 128 | |
%2412 = torch.prim.ListConstruct %997, %int32_1824, %int8_1825, %int128_1826 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2413 = torch.aten.view %2411, %2412 : !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2413, [%773], affine_map<()[s0] -> (s0 * 4, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%int1_1827 = torch.constant.int 1 | |
%int1_1828 = torch.constant.int 1 | |
%2414 = torch.aten.add.Scalar %2383, %int1_1827, %int1_1828 : !torch.vtensor<[4,?],si64>, !torch.int, !torch.int -> !torch.vtensor<[4,?],si64> | |
torch.bind_symbolic_shape %2414, [%773], affine_map<()[s0] -> (4, s0)> : !torch.vtensor<[4,?],si64> | |
%2415 = torch.prim.ListConstruct %997 : (!torch.int) -> !torch.list<int> | |
%2416 = torch.aten.view %2414, %2415 : !torch.vtensor<[4,?],si64>, !torch.list<int> -> !torch.vtensor<[?],si64> | |
torch.bind_symbolic_shape %2416, [%773], affine_map<()[s0] -> (s0 * 4)> : !torch.vtensor<[?],si64> | |
%int26_1829 = torch.constant.int 26 | |
%2417 = torch.prims.convert_element_type %2413, %int26_1829 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2417, [%773], affine_map<()[s0] -> (s0 * 4, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%int1_1830 = torch.constant.int 1 | |
%2418 = torch.aten.view.dtype %2417, %int1_1830 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %2418, [%773], affine_map<()[s0] -> (s0 * 4, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%2419 = torch.aten.detach %2418 : !torch.vtensor<[?,32,8,128],si8> -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %2419, [%773], affine_map<()[s0] -> (s0 * 4, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%2420 = torch.aten.detach %2419 : !torch.vtensor<[?,32,8,128],si8> -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %2420, [%773], affine_map<()[s0] -> (s0 * 4, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%int32_1831 = torch.constant.int 32 | |
%int2_1832 = torch.constant.int 2 | |
%int32_1833 = torch.constant.int 32 | |
%int8_1834 = torch.constant.int 8 | |
%int128_1835 = torch.constant.int 128 | |
%2421 = torch.prim.ListConstruct %776, %int32_1831, %int2_1832, %int32_1833, %int8_1834, %int128_1835 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2422 = torch.aten.view %2409, %2421 : !torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2422, [%774], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
%int32_1836 = torch.constant.int 32 | |
%int8_1837 = torch.constant.int 8 | |
%int128_1838 = torch.constant.int 128 | |
%2423 = torch.prim.ListConstruct %990, %int32_1836, %int8_1837, %int128_1838 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2424 = torch.aten.view %2422, %2423 : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2424, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%int1_1839 = torch.constant.int 1 | |
%2425 = torch.aten.view.dtype %2424, %int1_1839 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %2425, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%2426 = torch.aten.detach %2425 : !torch.vtensor<[?,32,8,128],si8> -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %2426, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%2427 = torch.aten.detach %2426 : !torch.vtensor<[?,32,8,128],si8> -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %2427, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%2428 = torch.prim.ListConstruct %2416 : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>> | |
%false_1840 = torch.constant.bool false | |
%2429 = torch.aten.index_put %2427, %2428, %2420, %false_1840 : !torch.vtensor<[?,32,8,128],si8>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],si8>, !torch.bool -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %2429, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%int26_1841 = torch.constant.int 26 | |
%2430 = torch.aten.view.dtype %2429, %int26_1841 : !torch.vtensor<[?,32,8,128],si8>, !torch.int -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2430, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%2431 = torch.aten.detach %2430 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2431, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%2432 = torch.aten.detach %2431 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2432, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%int32_1842 = torch.constant.int 32 | |
%int2_1843 = torch.constant.int 2 | |
%int32_1844 = torch.constant.int 32 | |
%int8_1845 = torch.constant.int 8 | |
%int128_1846 = torch.constant.int 128 | |
%2433 = torch.prim.ListConstruct %776, %int32_1842, %int2_1843, %int32_1844, %int8_1845, %int128_1846 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2434 = torch.aten.view %2432, %2433 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2434, [%774], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
%int2097152_1847 = torch.constant.int 2097152 | |
%2435 = torch.prim.ListConstruct %776, %int2097152_1847 : (!torch.int, !torch.int) -> !torch.list<int> | |
%2436 = torch.aten.view %2434, %2435 : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,2097152],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2436, [%774], affine_map<()[s0] -> (s0, 2097152)> : !torch.vtensor<[?,2097152],f8E4M3FNUZ> | |
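// Grouped-query attention head expansion: the 8 KV heads are unsqueezed, expanded by a factor
// of 4 and flattened to match the 32 query heads ([4, ?, 8, 128] -> [4, ?, 32, 128]) for both K and V.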
%int-2_1848 = torch.constant.int -2 | |
%2437 = torch.aten.unsqueeze %2381, %int-2_1848 : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2437, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 1, 128)> : !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ> | |
%int4_1849 = torch.constant.int 4 | |
%int8_1850 = torch.constant.int 8 | |
%int4_1851 = torch.constant.int 4 | |
%int128_1852 = torch.constant.int 128 | |
%2438 = torch.prim.ListConstruct %int4_1849, %777, %int8_1850, %int4_1851, %int128_1852 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_1853 = torch.constant.bool false | |
%2439 = torch.aten.expand %2437, %2438, %false_1853 : !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2439, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 4, 128)> : !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ> | |
%int0_1854 = torch.constant.int 0 | |
%2440 = torch.aten.clone %2439, %int0_1854 : !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2440, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 4, 128)> : !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ> | |
%int4_1855 = torch.constant.int 4 | |
%int32_1856 = torch.constant.int 32 | |
%int128_1857 = torch.constant.int 128 | |
%2441 = torch.prim.ListConstruct %int4_1855, %777, %int32_1856, %int128_1857 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2442 = torch.aten._unsafe_view %2440, %2441 : !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2442, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 128)> : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
%int-2_1858 = torch.constant.int -2 | |
%2443 = torch.aten.unsqueeze %2255, %int-2_1858 : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2443, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 1, 128)> : !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ> | |
%int4_1859 = torch.constant.int 4 | |
%int8_1860 = torch.constant.int 8 | |
%int4_1861 = torch.constant.int 4 | |
%int128_1862 = torch.constant.int 128 | |
%2444 = torch.prim.ListConstruct %int4_1859, %777, %int8_1860, %int4_1861, %int128_1862 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_1863 = torch.constant.bool false | |
%2445 = torch.aten.expand %2443, %2444, %false_1863 : !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2445, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 4, 128)> : !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ> | |
%int0_1864 = torch.constant.int 0 | |
%2446 = torch.aten.clone %2445, %int0_1864 : !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2446, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 4, 128)> : !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ> | |
%int4_1865 = torch.constant.int 4 | |
%int32_1866 = torch.constant.int 32 | |
%int128_1867 = torch.constant.int 128 | |
%2447 = torch.prim.ListConstruct %int4_1865, %777, %int32_1866, %int128_1867 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2448 = torch.aten._unsafe_view %2446, %2447 : !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2448, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 128)> : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
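// Q (%2318), expanded K (%2442) and expanded V (%2448) are transposed to
// [batch, heads, seq, head_dim] and cast to f8E4M3FNUZ (torch dtype 26) for the fused kernel.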
%int1_1868 = torch.constant.int 1 | |
%int2_1869 = torch.constant.int 2 | |
%2449 = torch.aten.transpose.int %2318, %int1_1868, %int2_1869 : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2449, [%773], affine_map<()[s0] -> (4, 32, s0 * 32, 128)> : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
%int1_1870 = torch.constant.int 1 | |
%int2_1871 = torch.constant.int 2 | |
%2450 = torch.aten.transpose.int %2442, %int1_1870, %int2_1871 : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2450, [%773], affine_map<()[s0] -> (4, 32, s0 * 32, 128)> : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
%int1_1872 = torch.constant.int 1 | |
%int2_1873 = torch.constant.int 2 | |
%2451 = torch.aten.transpose.int %2448, %int1_1872, %int2_1873 : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2451, [%773], affine_map<()[s0] -> (4, 32, s0 * 32, 128)> : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
%int26_1874 = torch.constant.int 26 | |
%2452 = torch.prims.convert_element_type %2449, %int26_1874 : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2452, [%773], affine_map<()[s0] -> (4, 32, s0 * 32, 128)> : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
%int26_1875 = torch.constant.int 26 | |
%2453 = torch.prims.convert_element_type %2450, %int26_1875 : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2453, [%773], affine_map<()[s0] -> (4, 32, s0 * 32, 128)> : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
%int26_1876 = torch.constant.int 26 | |
%2454 = torch.prims.convert_element_type %2451, %int26_1876 : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2454, [%773], affine_map<()[s0] -> (4, 32, s0 * 32, 128)> : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
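// The additive attention mask %803 is cast to f8E4M3FNUZ and reduced to a 2-D [seq, seq]
// slice, presumably because the mask is identical across the batch and head dimensions.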
%int26_1877 = torch.constant.int 26 | |
%2455 = torch.prims.convert_element_type %803, %int26_1877 : !torch.vtensor<[4,1,?,?],f32>, !torch.int -> !torch.vtensor<[4,1,?,?],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2455, [%773], affine_map<()[s0] -> (4, 1, s0 * 32, s0 * 32)> : !torch.vtensor<[4,1,?,?],f8E4M3FNUZ> | |
%int0_1878 = torch.constant.int 0 | |
%int0_1879 = torch.constant.int 0 | |
%2456 = torch.aten.select.int %2455, %int0_1878, %int0_1879 : !torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[1,?,?],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2456, [%773], affine_map<()[s0] -> (1, s0 * 32, s0 * 32)> : !torch.vtensor<[1,?,?],f8E4M3FNUZ> | |
%int0_1880 = torch.constant.int 0 | |
%int0_1881 = torch.constant.int 0 | |
%2457 = torch.aten.select.int %2456, %int0_1880, %int0_1881 : !torch.vtensor<[1,?,?],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[?,?],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2457, [%773], affine_map<()[s0] -> (s0 * 32, s0 * 32)> : !torch.vtensor<[?,?],f8E4M3FNUZ> | |
%int0_1882 = torch.constant.int 0 | |
%int0_1883 = torch.constant.int 0 | |
%int9223372036854775807_1884 = torch.constant.int 9223372036854775807 | |
%int1_1885 = torch.constant.int 1 | |
%2458 = torch.aten.slice.Tensor %2457, %int0_1882, %int0_1883, %int9223372036854775807_1884, %int1_1885 : !torch.vtensor<[?,?],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,?],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2458, [%773], affine_map<()[s0] -> (s0 * 32, s0 * 32)> : !torch.vtensor<[?,?],f8E4M3FNUZ> | |
%int1_1886 = torch.constant.int 1 | |
%int0_1887 = torch.constant.int 0 | |
%int9223372036854775807_1888 = torch.constant.int 9223372036854775807 | |
%int1_1889 = torch.constant.int 1 | |
%2459 = torch.aten.slice.Tensor %2458, %int1_1886, %int0_1887, %int9223372036854775807_1888, %int1_1889 : !torch.vtensor<[?,?],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,?],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2459, [%773], affine_map<()[s0] -> (s0 * 32, s0 * 32)> : !torch.vtensor<[?,?],f8E4M3FNUZ> | |
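// Attention itself is delegated to an external kernel: %2469 calls
// sharktank_masked_flash_attention_4_32_128_128_f8E4M3FNUZ_f8E4M3FNUZ_f32_f32 with Q, K, V,
// the scalar scale %2468 (cloned from %107) and the 2-D mask, returning f32 of shape [4, 32, ?, 128].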
%none_1890 = torch.constant.none | |
%2460 = torch.aten.clone %107, %none_1890 : !torch.vtensor<[],f32>, !torch.none -> !torch.vtensor<[],f32> | |
%2461 = torch.aten.detach %2460 : !torch.vtensor<[],f32> -> !torch.vtensor<[],f32> | |
%2462 = torch.aten.detach %2461 : !torch.vtensor<[],f32> -> !torch.vtensor<[],f32> | |
%2463 = torch.aten.detach %2462 : !torch.vtensor<[],f32> -> !torch.vtensor<[],f32> | |
%2464 = torch_c.to_builtin_tensor %2452 : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> -> tensor<4x32x?x128xf8E4M3FNUZ> | |
%2465 = torch_c.to_builtin_tensor %2453 : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> -> tensor<4x32x?x128xf8E4M3FNUZ> | |
%2466 = torch_c.to_builtin_tensor %2454 : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> -> tensor<4x32x?x128xf8E4M3FNUZ> | |
%2467 = torch_c.to_builtin_tensor %2459 : !torch.vtensor<[?,?],f8E4M3FNUZ> -> tensor<?x?xf8E4M3FNUZ> | |
%2468 = torch_c.to_builtin_tensor %2463 : !torch.vtensor<[],f32> -> tensor<f32> | |
%2469 = util.call @sharktank_masked_flash_attention_4_32_128_128_f8E4M3FNUZ_f8E4M3FNUZ_f32_f32(%2464, %2465, %2466, %2468, %2467) : (tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<f32>, tensor<?x?xf8E4M3FNUZ>) -> tensor<4x32x?x128xf32> | |
%2470 = torch_c.from_builtin_tensor %2469 : tensor<4x32x?x128xf32> -> !torch.vtensor<[4,32,?,128],f32> | |
torch.bind_symbolic_shape %2470, [%773], affine_map<()[s0] -> (4, 32, s0 * 32, 128)> : !torch.vtensor<[4,32,?,128],f32> | |
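// Post-attention: the result is transposed back to [4, ?, 32, 128], flattened to [4, ?, 4096],
// then re-quantized for the output projection (divide by the input rscale %108, clamp to the
// f8E4M3FNUZ range of +-240, convert to dtype 26). The batched matmul against the expanded
// attn_output weight %109 follows, with the dequant scale %108 * %110 applied before the
// residual add into %2490.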
%int1_1891 = torch.constant.int 1 | |
%int2_1892 = torch.constant.int 2 | |
%2471 = torch.aten.transpose.int %2470, %int1_1891, %int2_1892 : !torch.vtensor<[4,32,?,128],f32>, !torch.int, !torch.int -> !torch.vtensor<[4,?,32,128],f32> | |
torch.bind_symbolic_shape %2471, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 128)> : !torch.vtensor<[4,?,32,128],f32> | |
%int0_1893 = torch.constant.int 0 | |
%2472 = torch.aten.clone %2471, %int0_1893 : !torch.vtensor<[4,?,32,128],f32>, !torch.int -> !torch.vtensor<[4,?,32,128],f32> | |
torch.bind_symbolic_shape %2472, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 128)> : !torch.vtensor<[4,?,32,128],f32> | |
%int4_1894 = torch.constant.int 4 | |
%int4096_1895 = torch.constant.int 4096 | |
%2473 = torch.prim.ListConstruct %int4_1894, %777, %int4096_1895 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2474 = torch.aten._unsafe_view %2472, %2473 : !torch.vtensor<[4,?,32,128],f32>, !torch.list<int> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2474, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%2475 = torch.aten.div.Tensor %2474, %108 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2475, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%float-2.400000e02_1896 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_1897 = torch.constant.float 2.400000e+02 | |
%2476 = torch.aten.clamp %2475, %float-2.400000e02_1896, %float2.400000e02_1897 : !torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2476, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int26_1898 = torch.constant.int 26 | |
%2477 = torch.prims.convert_element_type %2476, %int26_1898 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2477, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
%int0_1899 = torch.constant.int 0 | |
%2478 = torch.aten.unsqueeze %109, %int0_1899 : !torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ> | |
%int4_1900 = torch.constant.int 4 | |
%int4096_1901 = torch.constant.int 4096 | |
%int4096_1902 = torch.constant.int 4096 | |
%2479 = torch.prim.ListConstruct %int4_1900, %int4096_1901, %int4096_1902 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_1903 = torch.constant.bool false | |
%2480 = torch.aten.expand %2478, %2479, %false_1903 : !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ> | |
%2481 = torch_c.to_builtin_tensor %2477 : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> -> tensor<4x?x4096xf8E4M3FNUZ> | |
%2482 = torch_c.to_builtin_tensor %2480 : !torch.vtensor<[4,4096,4096],f8E4M3FNUZ> -> tensor<4x4096x4096xf8E4M3FNUZ> | |
%2483 = util.call @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ_f32(%2481, %2482) : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x?x4096xf32> | |
%2484 = torch_c.from_builtin_tensor %2483 : tensor<4x?x4096xf32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2484, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%2485 = torch.prim.ListConstruct : () -> !torch.list<int> | |
%2486 = torch.aten.permute %110, %2485 : !torch.vtensor<[],f32>, !torch.list<int> -> !torch.vtensor<[],f32> | |
%2487 = torch.aten.mul.Tensor %108, %2486 : !torch.vtensor<[],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[],f32> | |
%int6_1904 = torch.constant.int 6 | |
%2488 = torch.prims.convert_element_type %2484, %int6_1904 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2488, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%2489 = torch.aten.mul.Tensor %2488, %2487 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2489, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int1_1905 = torch.constant.int 1 | |
%2490 = torch.aten.add.Tensor %2200, %2489, %int1_1905 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2490, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
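// FFN pre-norm: RMSNorm of the residual stream (square, mean over the last dim, +1e-05,
// rsqrt, scale by the ffn_norm weight %111), followed by fp8 quantization with rscale %112
// and the usual +-240 clamp.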
%int6_1906 = torch.constant.int 6 | |
%2491 = torch.prims.convert_element_type %2490, %int6_1906 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2491, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int2_1907 = torch.constant.int 2 | |
%2492 = torch.aten.pow.Tensor_Scalar %2491, %int2_1907 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2492, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int-1_1908 = torch.constant.int -1 | |
%2493 = torch.prim.ListConstruct %int-1_1908 : (!torch.int) -> !torch.list<int> | |
%true_1909 = torch.constant.bool true | |
%none_1910 = torch.constant.none | |
%2494 = torch.aten.mean.dim %2492, %2493, %true_1909, %none_1910 : !torch.vtensor<[4,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[4,?,1],f32> | |
torch.bind_symbolic_shape %2494, [%773], affine_map<()[s0] -> (4, s0 * 32, 1)> : !torch.vtensor<[4,?,1],f32> | |
%float1.000000e-05_1911 = torch.constant.float 1.000000e-05 | |
%int1_1912 = torch.constant.int 1 | |
%2495 = torch.aten.add.Scalar %2494, %float1.000000e-05_1911, %int1_1912 : !torch.vtensor<[4,?,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[4,?,1],f32> | |
torch.bind_symbolic_shape %2495, [%773], affine_map<()[s0] -> (4, s0 * 32, 1)> : !torch.vtensor<[4,?,1],f32> | |
%2496 = torch.aten.rsqrt %2495 : !torch.vtensor<[4,?,1],f32> -> !torch.vtensor<[4,?,1],f32> | |
torch.bind_symbolic_shape %2496, [%773], affine_map<()[s0] -> (4, s0 * 32, 1)> : !torch.vtensor<[4,?,1],f32> | |
%2497 = torch.aten.mul.Tensor %2491, %2496 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[4,?,1],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2497, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int6_1913 = torch.constant.int 6 | |
%2498 = torch.prims.convert_element_type %2497, %int6_1913 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2498, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%2499 = torch.aten.mul.Tensor %111, %2498 : !torch.vtensor<[4096],bf16>, !torch.vtensor<[4,?,4096],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2499, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int6_1914 = torch.constant.int 6 | |
%2500 = torch.prims.convert_element_type %2499, %int6_1914 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2500, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%2501 = torch.aten.div.Tensor %2500, %112 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2501, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%float-2.400000e02_1915 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_1916 = torch.constant.float 2.400000e+02 | |
%2502 = torch.aten.clamp %2501, %float-2.400000e02_1915, %float2.400000e02_1916 : !torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2502, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int26_1917 = torch.constant.int 26 | |
%2503 = torch.prims.convert_element_type %2502, %int26_1917 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2503, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
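// ffn_gate projection: batched matmul against the expanded [4, 14336, 4096] gate weight %113,
// dequantized by %112 * %114, then SiLU activation (%2516).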
%int0_1918 = torch.constant.int 0 | |
%2504 = torch.aten.unsqueeze %113, %int0_1918 : !torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ> | |
%int4_1919 = torch.constant.int 4 | |
%int14336_1920 = torch.constant.int 14336 | |
%int4096_1921 = torch.constant.int 4096 | |
%2505 = torch.prim.ListConstruct %int4_1919, %int14336_1920, %int4096_1921 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_1922 = torch.constant.bool false | |
%2506 = torch.aten.expand %2504, %2505, %false_1922 : !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ> | |
%2507 = torch_c.to_builtin_tensor %2503 : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> -> tensor<4x?x4096xf8E4M3FNUZ> | |
%2508 = torch_c.to_builtin_tensor %2506 : !torch.vtensor<[4,14336,4096],f8E4M3FNUZ> -> tensor<4x14336x4096xf8E4M3FNUZ> | |
%2509 = util.call @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ_f32(%2507, %2508) : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x?x14336xf32> | |
%2510 = torch_c.from_builtin_tensor %2509 : tensor<4x?x14336xf32> -> !torch.vtensor<[4,?,14336],f32> | |
torch.bind_symbolic_shape %2510, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f32> | |
%2511 = torch.prim.ListConstruct : () -> !torch.list<int> | |
%2512 = torch.aten.permute %114, %2511 : !torch.vtensor<[],f32>, !torch.list<int> -> !torch.vtensor<[],f32> | |
%2513 = torch.aten.mul.Tensor %112, %2512 : !torch.vtensor<[],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[],f32> | |
%int6_1923 = torch.constant.int 6 | |
%2514 = torch.prims.convert_element_type %2510, %int6_1923 : !torch.vtensor<[4,?,14336],f32>, !torch.int -> !torch.vtensor<[4,?,14336],f32> | |
torch.bind_symbolic_shape %2514, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f32> | |
%2515 = torch.aten.mul.Tensor %2514, %2513 : !torch.vtensor<[4,?,14336],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,14336],f32> | |
torch.bind_symbolic_shape %2515, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f32> | |
%2516 = torch.aten.silu %2515 : !torch.vtensor<[4,?,14336],f32> -> !torch.vtensor<[4,?,14336],f32> | |
torch.bind_symbolic_shape %2516, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f32> | |
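// ffn_up projection: the normalized activations are re-quantized with rscale %115, multiplied
// against the up weight %116, dequantized by %115 * %117, and gated by multiplying with the
// SiLU output (%2532).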
%2517 = torch.aten.div.Tensor %2500, %115 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2517, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%float-2.400000e02_1924 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_1925 = torch.constant.float 2.400000e+02 | |
%2518 = torch.aten.clamp %2517, %float-2.400000e02_1924, %float2.400000e02_1925 : !torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2518, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int26_1926 = torch.constant.int 26 | |
%2519 = torch.prims.convert_element_type %2518, %int26_1926 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2519, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
%int0_1927 = torch.constant.int 0 | |
%2520 = torch.aten.unsqueeze %116, %int0_1927 : !torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ> | |
%int4_1928 = torch.constant.int 4 | |
%int14336_1929 = torch.constant.int 14336 | |
%int4096_1930 = torch.constant.int 4096 | |
%2521 = torch.prim.ListConstruct %int4_1928, %int14336_1929, %int4096_1930 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_1931 = torch.constant.bool false | |
%2522 = torch.aten.expand %2520, %2521, %false_1931 : !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ> | |
%2523 = torch_c.to_builtin_tensor %2519 : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> -> tensor<4x?x4096xf8E4M3FNUZ> | |
%2524 = torch_c.to_builtin_tensor %2522 : !torch.vtensor<[4,14336,4096],f8E4M3FNUZ> -> tensor<4x14336x4096xf8E4M3FNUZ> | |
%2525 = util.call @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ_f32(%2523, %2524) : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x?x14336xf32> | |
%2526 = torch_c.from_builtin_tensor %2525 : tensor<4x?x14336xf32> -> !torch.vtensor<[4,?,14336],f32> | |
torch.bind_symbolic_shape %2526, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f32> | |
%2527 = torch.prim.ListConstruct : () -> !torch.list<int> | |
%2528 = torch.aten.permute %117, %2527 : !torch.vtensor<[],f32>, !torch.list<int> -> !torch.vtensor<[],f32> | |
%2529 = torch.aten.mul.Tensor %115, %2528 : !torch.vtensor<[],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[],f32> | |
%int6_1932 = torch.constant.int 6 | |
%2530 = torch.prims.convert_element_type %2526, %int6_1932 : !torch.vtensor<[4,?,14336],f32>, !torch.int -> !torch.vtensor<[4,?,14336],f32> | |
torch.bind_symbolic_shape %2530, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f32> | |
%2531 = torch.aten.mul.Tensor %2530, %2529 : !torch.vtensor<[4,?,14336],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,14336],f32> | |
torch.bind_symbolic_shape %2531, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f32> | |
%2532 = torch.aten.mul.Tensor %2516, %2531 : !torch.vtensor<[4,?,14336],f32>, !torch.vtensor<[4,?,14336],f32> -> !torch.vtensor<[4,?,14336],f32> | |
torch.bind_symbolic_shape %2532, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f32> | |
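// ffn_down projection: the gated activations are quantized with rscale %118, multiplied
// against the [4, 4096, 14336] down weight %119, dequantized by %118 * %120, and added back
// into the residual stream (%2548).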
%2533 = torch.aten.div.Tensor %2532, %118 : !torch.vtensor<[4,?,14336],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,14336],f32> | |
torch.bind_symbolic_shape %2533, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f32> | |
%float-2.400000e02_1933 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_1934 = torch.constant.float 2.400000e+02 | |
%2534 = torch.aten.clamp %2533, %float-2.400000e02_1933, %float2.400000e02_1934 : !torch.vtensor<[4,?,14336],f32>, !torch.float, !torch.float -> !torch.vtensor<[4,?,14336],f32> | |
torch.bind_symbolic_shape %2534, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f32> | |
%int26_1935 = torch.constant.int 26 | |
%2535 = torch.prims.convert_element_type %2534, %int26_1935 : !torch.vtensor<[4,?,14336],f32>, !torch.int -> !torch.vtensor<[4,?,14336],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2535, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f8E4M3FNUZ> | |
%int0_1936 = torch.constant.int 0 | |
%2536 = torch.aten.unsqueeze %119, %int0_1936 : !torch.vtensor<[4096,14336],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,4096,14336],f8E4M3FNUZ> | |
%int4_1937 = torch.constant.int 4 | |
%int4096_1938 = torch.constant.int 4096 | |
%int14336_1939 = torch.constant.int 14336 | |
%2537 = torch.prim.ListConstruct %int4_1937, %int4096_1938, %int14336_1939 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_1940 = torch.constant.bool false | |
%2538 = torch.aten.expand %2536, %2537, %false_1940 : !torch.vtensor<[1,4096,14336],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[4,4096,14336],f8E4M3FNUZ> | |
%2539 = torch_c.to_builtin_tensor %2535 : !torch.vtensor<[4,?,14336],f8E4M3FNUZ> -> tensor<4x?x14336xf8E4M3FNUZ> | |
%2540 = torch_c.to_builtin_tensor %2538 : !torch.vtensor<[4,4096,14336],f8E4M3FNUZ> -> tensor<4x4096x14336xf8E4M3FNUZ> | |
%2541 = util.call @sharktank_batch_matmul_transpose_b_L4xDx14336xf8E4M3FNUZ_R4x4096x14336xf8E4M3FNUZ_f32(%2539, %2540) : (tensor<4x?x14336xf8E4M3FNUZ>, tensor<4x4096x14336xf8E4M3FNUZ>) -> tensor<4x?x4096xf32> | |
%2542 = torch_c.from_builtin_tensor %2541 : tensor<4x?x4096xf32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2542, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%2543 = torch.prim.ListConstruct : () -> !torch.list<int> | |
%2544 = torch.aten.permute %120, %2543 : !torch.vtensor<[],f32>, !torch.list<int> -> !torch.vtensor<[],f32> | |
%2545 = torch.aten.mul.Tensor %118, %2544 : !torch.vtensor<[],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[],f32> | |
%int6_1941 = torch.constant.int 6 | |
%2546 = torch.prims.convert_element_type %2542, %int6_1941 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2546, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%2547 = torch.aten.mul.Tensor %2546, %2545 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2547, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int1_1942 = torch.constant.int 1 | |
%2548 = torch.aten.add.Tensor %2490, %2547, %int1_1942 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2548, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
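// What follows looks like the next block's input RMSNorm (weight %121) and fp8 quantization
// (rscale %122), feeding its attention Q projection against the 4096x4096 weight %123.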
%int6_1943 = torch.constant.int 6 | |
%2549 = torch.prims.convert_element_type %2548, %int6_1943 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2549, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int2_1944 = torch.constant.int 2 | |
%2550 = torch.aten.pow.Tensor_Scalar %2549, %int2_1944 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2550, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int-1_1945 = torch.constant.int -1 | |
%2551 = torch.prim.ListConstruct %int-1_1945 : (!torch.int) -> !torch.list<int> | |
%true_1946 = torch.constant.bool true | |
%none_1947 = torch.constant.none | |
%2552 = torch.aten.mean.dim %2550, %2551, %true_1946, %none_1947 : !torch.vtensor<[4,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[4,?,1],f32> | |
torch.bind_symbolic_shape %2552, [%773], affine_map<()[s0] -> (4, s0 * 32, 1)> : !torch.vtensor<[4,?,1],f32> | |
%float1.000000e-05_1948 = torch.constant.float 1.000000e-05 | |
%int1_1949 = torch.constant.int 1 | |
%2553 = torch.aten.add.Scalar %2552, %float1.000000e-05_1948, %int1_1949 : !torch.vtensor<[4,?,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[4,?,1],f32> | |
torch.bind_symbolic_shape %2553, [%773], affine_map<()[s0] -> (4, s0 * 32, 1)> : !torch.vtensor<[4,?,1],f32> | |
%2554 = torch.aten.rsqrt %2553 : !torch.vtensor<[4,?,1],f32> -> !torch.vtensor<[4,?,1],f32> | |
torch.bind_symbolic_shape %2554, [%773], affine_map<()[s0] -> (4, s0 * 32, 1)> : !torch.vtensor<[4,?,1],f32> | |
%2555 = torch.aten.mul.Tensor %2549, %2554 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[4,?,1],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2555, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int6_1950 = torch.constant.int 6 | |
%2556 = torch.prims.convert_element_type %2555, %int6_1950 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2556, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%2557 = torch.aten.mul.Tensor %121, %2556 : !torch.vtensor<[4096],bf16>, !torch.vtensor<[4,?,4096],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2557, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int6_1951 = torch.constant.int 6 | |
%2558 = torch.prims.convert_element_type %2557, %int6_1951 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2558, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
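// Q/K/V projections on the normalized activations %2558. Each input is quantized to f8E4M3FNUZ by
// dividing by its per-tensor input scale and clamping to [-240, 240] (the finite range of f8E4M3FNUZ),
// then fed to the batched transpose-B matmul kernel against the expanded f8 weight; each f32 result is
// requantized to f8 with the corresponding output scale.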
%2559 = torch.aten.div.Tensor %2558, %122 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2559, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%float-2.400000e02_1952 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_1953 = torch.constant.float 2.400000e+02 | |
%2560 = torch.aten.clamp %2559, %float-2.400000e02_1952, %float2.400000e02_1953 : !torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2560, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int26_1954 = torch.constant.int 26 | |
%2561 = torch.prims.convert_element_type %2560, %int26_1954 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2561, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
%int0_1955 = torch.constant.int 0 | |
%2562 = torch.aten.unsqueeze %123, %int0_1955 : !torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ> | |
%int4_1956 = torch.constant.int 4 | |
%int4096_1957 = torch.constant.int 4096 | |
%int4096_1958 = torch.constant.int 4096 | |
%2563 = torch.prim.ListConstruct %int4_1956, %int4096_1957, %int4096_1958 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_1959 = torch.constant.bool false | |
%2564 = torch.aten.expand %2562, %2563, %false_1959 : !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ> | |
%2565 = torch_c.to_builtin_tensor %2561 : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> -> tensor<4x?x4096xf8E4M3FNUZ> | |
%2566 = torch_c.to_builtin_tensor %2564 : !torch.vtensor<[4,4096,4096],f8E4M3FNUZ> -> tensor<4x4096x4096xf8E4M3FNUZ> | |
%2567 = util.call @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ_f32(%2565, %2566) : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x?x4096xf32> | |
%2568 = torch_c.from_builtin_tensor %2567 : tensor<4x?x4096xf32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2568, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%2569 = torch.aten.div.Tensor %2568, %124 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2569, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%float-2.400000e02_1960 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_1961 = torch.constant.float 2.400000e+02 | |
%2570 = torch.aten.clamp %2569, %float-2.400000e02_1960, %float2.400000e02_1961 : !torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2570, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int26_1962 = torch.constant.int 26 | |
%2571 = torch.prims.convert_element_type %2570, %int26_1962 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2571, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
%2572 = torch.aten.div.Tensor %2558, %125 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2572, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%float-2.400000e02_1963 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_1964 = torch.constant.float 2.400000e+02 | |
%2573 = torch.aten.clamp %2572, %float-2.400000e02_1963, %float2.400000e02_1964 : !torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2573, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int26_1965 = torch.constant.int 26 | |
%2574 = torch.prims.convert_element_type %2573, %int26_1965 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2574, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
%int0_1966 = torch.constant.int 0 | |
%2575 = torch.aten.unsqueeze %126, %int0_1966 : !torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ> | |
%int4_1967 = torch.constant.int 4 | |
%int1024_1968 = torch.constant.int 1024 | |
%int4096_1969 = torch.constant.int 4096 | |
%2576 = torch.prim.ListConstruct %int4_1967, %int1024_1968, %int4096_1969 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_1970 = torch.constant.bool false | |
%2577 = torch.aten.expand %2575, %2576, %false_1970 : !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ> | |
%2578 = torch_c.to_builtin_tensor %2574 : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> -> tensor<4x?x4096xf8E4M3FNUZ> | |
%2579 = torch_c.to_builtin_tensor %2577 : !torch.vtensor<[4,1024,4096],f8E4M3FNUZ> -> tensor<4x1024x4096xf8E4M3FNUZ> | |
%2580 = util.call @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ_f32(%2578, %2579) : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x?x1024xf32> | |
%2581 = torch_c.from_builtin_tensor %2580 : tensor<4x?x1024xf32> -> !torch.vtensor<[4,?,1024],f32> | |
torch.bind_symbolic_shape %2581, [%773], affine_map<()[s0] -> (4, s0 * 32, 1024)> : !torch.vtensor<[4,?,1024],f32> | |
%2582 = torch.aten.div.Tensor %2581, %127 : !torch.vtensor<[4,?,1024],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,1024],f32> | |
torch.bind_symbolic_shape %2582, [%773], affine_map<()[s0] -> (4, s0 * 32, 1024)> : !torch.vtensor<[4,?,1024],f32> | |
%float-2.400000e02_1971 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_1972 = torch.constant.float 2.400000e+02 | |
%2583 = torch.aten.clamp %2582, %float-2.400000e02_1971, %float2.400000e02_1972 : !torch.vtensor<[4,?,1024],f32>, !torch.float, !torch.float -> !torch.vtensor<[4,?,1024],f32> | |
torch.bind_symbolic_shape %2583, [%773], affine_map<()[s0] -> (4, s0 * 32, 1024)> : !torch.vtensor<[4,?,1024],f32> | |
%int26_1973 = torch.constant.int 26 | |
%2584 = torch.prims.convert_element_type %2583, %int26_1973 : !torch.vtensor<[4,?,1024],f32>, !torch.int -> !torch.vtensor<[4,?,1024],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2584, [%773], affine_map<()[s0] -> (4, s0 * 32, 1024)> : !torch.vtensor<[4,?,1024],f8E4M3FNUZ> | |
%2585 = torch.aten.div.Tensor %2558, %128 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2585, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%float-2.400000e02_1974 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_1975 = torch.constant.float 2.400000e+02 | |
%2586 = torch.aten.clamp %2585, %float-2.400000e02_1974, %float2.400000e02_1975 : !torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2586, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int26_1976 = torch.constant.int 26 | |
%2587 = torch.prims.convert_element_type %2586, %int26_1976 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2587, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
%int0_1977 = torch.constant.int 0 | |
%2588 = torch.aten.unsqueeze %129, %int0_1977 : !torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ> | |
%int4_1978 = torch.constant.int 4 | |
%int1024_1979 = torch.constant.int 1024 | |
%int4096_1980 = torch.constant.int 4096 | |
%2589 = torch.prim.ListConstruct %int4_1978, %int1024_1979, %int4096_1980 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_1981 = torch.constant.bool false | |
%2590 = torch.aten.expand %2588, %2589, %false_1981 : !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ> | |
%2591 = torch_c.to_builtin_tensor %2587 : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> -> tensor<4x?x4096xf8E4M3FNUZ> | |
%2592 = torch_c.to_builtin_tensor %2590 : !torch.vtensor<[4,1024,4096],f8E4M3FNUZ> -> tensor<4x1024x4096xf8E4M3FNUZ> | |
%2593 = util.call @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ_f32(%2591, %2592) : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x?x1024xf32> | |
%2594 = torch_c.from_builtin_tensor %2593 : tensor<4x?x1024xf32> -> !torch.vtensor<[4,?,1024],f32> | |
torch.bind_symbolic_shape %2594, [%773], affine_map<()[s0] -> (4, s0 * 32, 1024)> : !torch.vtensor<[4,?,1024],f32> | |
%2595 = torch.aten.div.Tensor %2594, %130 : !torch.vtensor<[4,?,1024],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,1024],f32> | |
torch.bind_symbolic_shape %2595, [%773], affine_map<()[s0] -> (4, s0 * 32, 1024)> : !torch.vtensor<[4,?,1024],f32> | |
%float-2.400000e02_1982 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_1983 = torch.constant.float 2.400000e+02 | |
%2596 = torch.aten.clamp %2595, %float-2.400000e02_1982, %float2.400000e02_1983 : !torch.vtensor<[4,?,1024],f32>, !torch.float, !torch.float -> !torch.vtensor<[4,?,1024],f32> | |
torch.bind_symbolic_shape %2596, [%773], affine_map<()[s0] -> (4, s0 * 32, 1024)> : !torch.vtensor<[4,?,1024],f32> | |
%int26_1984 = torch.constant.int 26 | |
%2597 = torch.prims.convert_element_type %2596, %int26_1984 : !torch.vtensor<[4,?,1024],f32>, !torch.int -> !torch.vtensor<[4,?,1024],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2597, [%773], affine_map<()[s0] -> (4, s0 * 32, 1024)> : !torch.vtensor<[4,?,1024],f8E4M3FNUZ> | |
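// Reshape the projections to per-head layout: Q -> [4, seq, 32, 128], K and V -> [4, seq, 8, 128]
// (32 query heads, 8 KV heads, head dim 128).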
%int4_1985 = torch.constant.int 4 | |
%int32_1986 = torch.constant.int 32 | |
%int128_1987 = torch.constant.int 128 | |
%2598 = torch.prim.ListConstruct %int4_1985, %777, %int32_1986, %int128_1987 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2599 = torch.aten.view %2571, %2598 : !torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2599, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 128)> : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
%int4_1988 = torch.constant.int 4 | |
%int8_1989 = torch.constant.int 8 | |
%int128_1990 = torch.constant.int 128 | |
%2600 = torch.prim.ListConstruct %int4_1988, %777, %int8_1989, %int128_1990 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2601 = torch.aten.view %2584, %2600 : !torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2601, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 128)> : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
%int4_1991 = torch.constant.int 4 | |
%int8_1992 = torch.constant.int 8 | |
%int128_1993 = torch.constant.int 128 | |
%2602 = torch.prim.ListConstruct %int4_1991, %777, %int8_1992, %int128_1993 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2603 = torch.aten.view %2597, %2602 : !torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2603, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 128)> : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
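// Build the rotary embedding tables: positions arange(131072) and inverse frequencies from base 5.0e+05
// over 64 even indices, with what appears to be Llama-3-style frequency scaling (components whose
// wavelength exceeds 8192 have their frequency divided by 8, those below 2048 are left unchanged, and
// the band in between is smoothly interpolated), followed by the outer product with the positions and
// cos/sin tables in bf16.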
%int131072_1994 = torch.constant.int 131072 | |
%none_1995 = torch.constant.none | |
%none_1996 = torch.constant.none | |
%cpu_1997 = torch.constant.device "cpu" | |
%false_1998 = torch.constant.bool false | |
%2604 = torch.aten.arange %int131072_1994, %none_1995, %none_1996, %cpu_1997, %false_1998 : !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[131072],si64> | |
%int0_1999 = torch.constant.int 0 | |
%int128_2000 = torch.constant.int 128 | |
%int2_2001 = torch.constant.int 2 | |
%int4_2002 = torch.constant.int 4 | |
%none_2003 = torch.constant.none | |
%cpu_2004 = torch.constant.device "cpu" | |
%false_2005 = torch.constant.bool false | |
%2605 = torch.aten.arange.start_step %int0_1999, %int128_2000, %int2_2001, %int4_2002, %none_2003, %cpu_2004, %false_2005 : !torch.int, !torch.int, !torch.int, !torch.int, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[64],si64> | |
%int6_2006 = torch.constant.int 6 | |
%2606 = torch.prims.convert_element_type %2605, %int6_2006 : !torch.vtensor<[64],si64>, !torch.int -> !torch.vtensor<[64],f32> | |
%int128_2007 = torch.constant.int 128 | |
%2607 = torch.aten.div.Scalar %2606, %int128_2007 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%float5.000000e05_2008 = torch.constant.float 5.000000e+05 | |
%2608 = torch.aten.pow.Scalar %float5.000000e05_2008, %2607 : !torch.float, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%2609 = torch.aten.reciprocal %2608 : !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%float1.000000e00_2009 = torch.constant.float 1.000000e+00 | |
%2610 = torch.aten.mul.Scalar %2609, %float1.000000e00_2009 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],f32> | |
%2611 = torch.aten.reciprocal %2610 : !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%float6.283190e00_2010 = torch.constant.float 6.2831853071795862 | |
%2612 = torch.aten.mul.Scalar %2611, %float6.283190e00_2010 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],f32> | |
%float8.192000e03_2011 = torch.constant.float 8.192000e+03 | |
%2613 = torch.aten.gt.Scalar %2612, %float8.192000e03_2011 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],i1> | |
%int8_2012 = torch.constant.int 8 | |
%2614 = torch.aten.div.Scalar %2610, %int8_2012 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%2615 = torch.aten.where.self %2613, %2614, %2610 : !torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%2616 = torch.aten.reciprocal %2612 : !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%int8192_2013 = torch.constant.int 8192 | |
%2617 = torch.aten.mul.Scalar %2616, %int8192_2013 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%int1_2014 = torch.constant.int 1 | |
%int1_2015 = torch.constant.int 1 | |
%2618 = torch.aten.sub.Scalar %2617, %int1_2014, %int1_2015 : !torch.vtensor<[64],f32>, !torch.int, !torch.int -> !torch.vtensor<[64],f32> | |
%int3_2016 = torch.constant.int 3 | |
%2619 = torch.aten.div.Scalar %2618, %int3_2016 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%int1_2017 = torch.constant.int 1 | |
%int1_2018 = torch.constant.int 1 | |
%2620 = torch.aten.rsub.Scalar %2619, %int1_2017, %int1_2018 : !torch.vtensor<[64],f32>, !torch.int, !torch.int -> !torch.vtensor<[64],f32> | |
%2621 = torch.aten.mul.Tensor %2620, %2615 : !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%int8_2019 = torch.constant.int 8 | |
%2622 = torch.aten.div.Scalar %2621, %int8_2019 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%2623 = torch.aten.mul.Tensor %2619, %2615 : !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%int1_2020 = torch.constant.int 1 | |
%2624 = torch.aten.add.Tensor %2622, %2623, %int1_2020 : !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%float2.048000e03_2021 = torch.constant.float 2.048000e+03 | |
%2625 = torch.aten.lt.Scalar %2612, %float2.048000e03_2021 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],i1> | |
%2626 = torch.aten.bitwise_not %2625 : !torch.vtensor<[64],i1> -> !torch.vtensor<[64],i1> | |
%float8.192000e03_2022 = torch.constant.float 8.192000e+03 | |
%2627 = torch.aten.gt.Scalar %2612, %float8.192000e03_2022 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],i1> | |
%2628 = torch.aten.bitwise_not %2627 : !torch.vtensor<[64],i1> -> !torch.vtensor<[64],i1> | |
%2629 = torch.aten.mul.Tensor %2626, %2628 : !torch.vtensor<[64],i1>, !torch.vtensor<[64],i1> -> !torch.vtensor<[64],i1> | |
%2630 = torch.aten.where.self %2629, %2624, %2615 : !torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%2631 = torch.prim.ListConstruct %2630, %2630 : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.list<vtensor> | |
%int-1_2023 = torch.constant.int -1 | |
%2632 = torch.aten.cat %2631, %int-1_2023 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[128],f32> | |
%int6_2024 = torch.constant.int 6 | |
%2633 = torch.prims.convert_element_type %2632, %int6_2024 : !torch.vtensor<[128],f32>, !torch.int -> !torch.vtensor<[128],f32> | |
%int1_2025 = torch.constant.int 1 | |
%2634 = torch.aten.unsqueeze %2604, %int1_2025 : !torch.vtensor<[131072],si64>, !torch.int -> !torch.vtensor<[131072,1],si64> | |
%int6_2026 = torch.constant.int 6 | |
%2635 = torch.prims.convert_element_type %2634, %int6_2026 : !torch.vtensor<[131072,1],si64>, !torch.int -> !torch.vtensor<[131072,1],f32> | |
%int0_2027 = torch.constant.int 0 | |
%2636 = torch.aten.unsqueeze %2633, %int0_2027 : !torch.vtensor<[128],f32>, !torch.int -> !torch.vtensor<[1,128],f32> | |
%int6_2028 = torch.constant.int 6 | |
%2637 = torch.prims.convert_element_type %2636, %int6_2028 : !torch.vtensor<[1,128],f32>, !torch.int -> !torch.vtensor<[1,128],f32> | |
%2638 = torch.aten.mul.Tensor %2635, %2637 : !torch.vtensor<[131072,1],f32>, !torch.vtensor<[1,128],f32> -> !torch.vtensor<[131072,128],f32> | |
%2639 = torch.aten.cos %2638 : !torch.vtensor<[131072,128],f32> -> !torch.vtensor<[131072,128],f32> | |
%int15_2029 = torch.constant.int 15 | |
%2640 = torch.prims.convert_element_type %2639, %int15_2029 : !torch.vtensor<[131072,128],f32>, !torch.int -> !torch.vtensor<[131072,128],bf16> | |
%2641 = torch.aten.sin %2638 : !torch.vtensor<[131072,128],f32> -> !torch.vtensor<[131072,128],f32> | |
%int15_2030 = torch.constant.int 15 | |
%2642 = torch.prims.convert_element_type %2641, %int15_2030 : !torch.vtensor<[131072,128],f32>, !torch.int -> !torch.vtensor<[131072,128],bf16> | |
%int0_2031 = torch.constant.int 0 | |
%int0_2032 = torch.constant.int 0 | |
%int1_2033 = torch.constant.int 1 | |
%2643 = torch.aten.slice.Tensor %2640, %int0_2031, %int0_2032, %777, %int1_2033 : !torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %2643, [%773], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int1_2034 = torch.constant.int 1 | |
%int0_2035 = torch.constant.int 0 | |
%int9223372036854775807_2036 = torch.constant.int 9223372036854775807 | |
%int1_2037 = torch.constant.int 1 | |
%2644 = torch.aten.slice.Tensor %2643, %int1_2034, %int0_2035, %int9223372036854775807_2036, %int1_2037 : !torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %2644, [%773], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int0_2038 = torch.constant.int 0 | |
%int0_2039 = torch.constant.int 0 | |
%int1_2040 = torch.constant.int 1 | |
%2645 = torch.aten.slice.Tensor %2642, %int0_2038, %int0_2039, %777, %int1_2040 : !torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %2645, [%773], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int1_2041 = torch.constant.int 1 | |
%int0_2042 = torch.constant.int 0 | |
%int9223372036854775807_2043 = torch.constant.int 9223372036854775807 | |
%int1_2044 = torch.constant.int 1 | |
%2646 = torch.aten.slice.Tensor %2645, %int1_2041, %int0_2042, %int9223372036854775807_2043, %int1_2044 : !torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %2646, [%773], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
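// Slice the cos/sin tables to the current sequence length and broadcast them to [4, seq, 1, 128] so they
// can be applied across all heads.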
%int0_2045 = torch.constant.int 0 | |
%2647 = torch.aten.unsqueeze %2644, %int0_2045 : !torch.vtensor<[?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %2647, [%773], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int1_2046 = torch.constant.int 1 | |
%int0_2047 = torch.constant.int 0 | |
%int9223372036854775807_2048 = torch.constant.int 9223372036854775807 | |
%int1_2049 = torch.constant.int 1 | |
%2648 = torch.aten.slice.Tensor %2647, %int1_2046, %int0_2047, %int9223372036854775807_2048, %int1_2049 : !torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %2648, [%773], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int2_2050 = torch.constant.int 2 | |
%2649 = torch.aten.unsqueeze %2648, %int2_2050 : !torch.vtensor<[1,?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %2649, [%773], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int3_2051 = torch.constant.int 3 | |
%int0_2052 = torch.constant.int 0 | |
%int9223372036854775807_2053 = torch.constant.int 9223372036854775807 | |
%int1_2054 = torch.constant.int 1 | |
%2650 = torch.aten.slice.Tensor %2649, %int3_2051, %int0_2052, %int9223372036854775807_2053, %int1_2054 : !torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %2650, [%773], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int4_2055 = torch.constant.int 4 | |
%int1_2056 = torch.constant.int 1 | |
%int1_2057 = torch.constant.int 1 | |
%int1_2058 = torch.constant.int 1 | |
%2651 = torch.prim.ListConstruct %int4_2055, %int1_2056, %int1_2057, %int1_2058 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2652 = torch.aten.repeat %2650, %2651 : !torch.vtensor<[1,?,1,128],bf16>, !torch.list<int> -> !torch.vtensor<[4,?,1,128],bf16> | |
torch.bind_symbolic_shape %2652, [%773], affine_map<()[s0] -> (4, s0 * 32, 1, 128)> : !torch.vtensor<[4,?,1,128],bf16> | |
%int0_2059 = torch.constant.int 0 | |
%2653 = torch.aten.unsqueeze %2646, %int0_2059 : !torch.vtensor<[?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %2653, [%773], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int1_2060 = torch.constant.int 1 | |
%int0_2061 = torch.constant.int 0 | |
%int9223372036854775807_2062 = torch.constant.int 9223372036854775807 | |
%int1_2063 = torch.constant.int 1 | |
%2654 = torch.aten.slice.Tensor %2653, %int1_2060, %int0_2061, %int9223372036854775807_2062, %int1_2063 : !torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %2654, [%773], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int2_2064 = torch.constant.int 2 | |
%2655 = torch.aten.unsqueeze %2654, %int2_2064 : !torch.vtensor<[1,?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %2655, [%773], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int3_2065 = torch.constant.int 3 | |
%int0_2066 = torch.constant.int 0 | |
%int9223372036854775807_2067 = torch.constant.int 9223372036854775807 | |
%int1_2068 = torch.constant.int 1 | |
%2656 = torch.aten.slice.Tensor %2655, %int3_2065, %int0_2066, %int9223372036854775807_2067, %int1_2068 : !torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %2656, [%773], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int4_2069 = torch.constant.int 4 | |
%int1_2070 = torch.constant.int 1 | |
%int1_2071 = torch.constant.int 1 | |
%int1_2072 = torch.constant.int 1 | |
%2657 = torch.prim.ListConstruct %int4_2069, %int1_2070, %int1_2071, %int1_2072 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2658 = torch.aten.repeat %2656, %2657 : !torch.vtensor<[1,?,1,128],bf16>, !torch.list<int> -> !torch.vtensor<[4,?,1,128],bf16> | |
torch.bind_symbolic_shape %2658, [%773], affine_map<()[s0] -> (4, s0 * 32, 1, 128)> : !torch.vtensor<[4,?,1,128],bf16> | |
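// Apply RoPE to Q: q * cos + rotate_half(q) * sin, where rotate_half concatenates the negated upper
// 64 lanes with the lower 64 lanes of each head.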
%2659 = torch.aten.mul.Tensor %2599, %2652 : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16> -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2659, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 128)> : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
%int3_2073 = torch.constant.int 3 | |
%int0_2074 = torch.constant.int 0 | |
%int64_2075 = torch.constant.int 64 | |
%int1_2076 = torch.constant.int 1 | |
%2660 = torch.aten.slice.Tensor %2599, %int3_2073, %int0_2074, %int64_2075, %int1_2076 : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2660, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 64)> : !torch.vtensor<[4,?,32,64],f8E4M3FNUZ> | |
%int3_2077 = torch.constant.int 3 | |
%int64_2078 = torch.constant.int 64 | |
%int9223372036854775807_2079 = torch.constant.int 9223372036854775807 | |
%int1_2080 = torch.constant.int 1 | |
%2661 = torch.aten.slice.Tensor %2599, %int3_2077, %int64_2078, %int9223372036854775807_2079, %int1_2080 : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2661, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 64)> : !torch.vtensor<[4,?,32,64],f8E4M3FNUZ> | |
%2662 = torch.aten.neg %2661 : !torch.vtensor<[4,?,32,64],f8E4M3FNUZ> -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2662, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 64)> : !torch.vtensor<[4,?,32,64],f8E4M3FNUZ> | |
%2663 = torch.prim.ListConstruct %2662, %2660 : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>) -> !torch.list<vtensor> | |
%int-1_2081 = torch.constant.int -1 | |
%2664 = torch.aten.cat %2663, %int-1_2081 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2664, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 128)> : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
%2665 = torch.aten.mul.Tensor %2664, %2658 : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16> -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2665, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 128)> : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
%int1_2082 = torch.constant.int 1 | |
%2666 = torch.aten.add.Tensor %2659, %2665, %int1_2082 : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2666, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 128)> : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
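// The same position/frequency tables are rebuilt from scratch for K (the trace does not reuse %2640/%2642),
// then sliced, broadcast, and applied to the 8 KV heads in the same way as for Q.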
%int131072_2083 = torch.constant.int 131072 | |
%none_2084 = torch.constant.none | |
%none_2085 = torch.constant.none | |
%cpu_2086 = torch.constant.device "cpu" | |
%false_2087 = torch.constant.bool false | |
%2667 = torch.aten.arange %int131072_2083, %none_2084, %none_2085, %cpu_2086, %false_2087 : !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[131072],si64> | |
%int0_2088 = torch.constant.int 0 | |
%int128_2089 = torch.constant.int 128 | |
%int2_2090 = torch.constant.int 2 | |
%int4_2091 = torch.constant.int 4 | |
%none_2092 = torch.constant.none | |
%cpu_2093 = torch.constant.device "cpu" | |
%false_2094 = torch.constant.bool false | |
%2668 = torch.aten.arange.start_step %int0_2088, %int128_2089, %int2_2090, %int4_2091, %none_2092, %cpu_2093, %false_2094 : !torch.int, !torch.int, !torch.int, !torch.int, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[64],si64> | |
%int6_2095 = torch.constant.int 6 | |
%2669 = torch.prims.convert_element_type %2668, %int6_2095 : !torch.vtensor<[64],si64>, !torch.int -> !torch.vtensor<[64],f32> | |
%int128_2096 = torch.constant.int 128 | |
%2670 = torch.aten.div.Scalar %2669, %int128_2096 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%float5.000000e05_2097 = torch.constant.float 5.000000e+05 | |
%2671 = torch.aten.pow.Scalar %float5.000000e05_2097, %2670 : !torch.float, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%2672 = torch.aten.reciprocal %2671 : !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%float1.000000e00_2098 = torch.constant.float 1.000000e+00 | |
%2673 = torch.aten.mul.Scalar %2672, %float1.000000e00_2098 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],f32> | |
%2674 = torch.aten.reciprocal %2673 : !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%float6.283190e00_2099 = torch.constant.float 6.2831853071795862 | |
%2675 = torch.aten.mul.Scalar %2674, %float6.283190e00_2099 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],f32> | |
%float8.192000e03_2100 = torch.constant.float 8.192000e+03 | |
%2676 = torch.aten.gt.Scalar %2675, %float8.192000e03_2100 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],i1> | |
%int8_2101 = torch.constant.int 8 | |
%2677 = torch.aten.div.Scalar %2673, %int8_2101 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%2678 = torch.aten.where.self %2676, %2677, %2673 : !torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%2679 = torch.aten.reciprocal %2675 : !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%int8192_2102 = torch.constant.int 8192 | |
%2680 = torch.aten.mul.Scalar %2679, %int8192_2102 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%int1_2103 = torch.constant.int 1 | |
%int1_2104 = torch.constant.int 1 | |
%2681 = torch.aten.sub.Scalar %2680, %int1_2103, %int1_2104 : !torch.vtensor<[64],f32>, !torch.int, !torch.int -> !torch.vtensor<[64],f32> | |
%int3_2105 = torch.constant.int 3 | |
%2682 = torch.aten.div.Scalar %2681, %int3_2105 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%int1_2106 = torch.constant.int 1 | |
%int1_2107 = torch.constant.int 1 | |
%2683 = torch.aten.rsub.Scalar %2682, %int1_2106, %int1_2107 : !torch.vtensor<[64],f32>, !torch.int, !torch.int -> !torch.vtensor<[64],f32> | |
%2684 = torch.aten.mul.Tensor %2683, %2678 : !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%int8_2108 = torch.constant.int 8 | |
%2685 = torch.aten.div.Scalar %2684, %int8_2108 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%2686 = torch.aten.mul.Tensor %2682, %2678 : !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%int1_2109 = torch.constant.int 1 | |
%2687 = torch.aten.add.Tensor %2685, %2686, %int1_2109 : !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%float2.048000e03_2110 = torch.constant.float 2.048000e+03 | |
%2688 = torch.aten.lt.Scalar %2675, %float2.048000e03_2110 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],i1> | |
%2689 = torch.aten.bitwise_not %2688 : !torch.vtensor<[64],i1> -> !torch.vtensor<[64],i1> | |
%float8.192000e03_2111 = torch.constant.float 8.192000e+03 | |
%2690 = torch.aten.gt.Scalar %2675, %float8.192000e03_2111 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],i1> | |
%2691 = torch.aten.bitwise_not %2690 : !torch.vtensor<[64],i1> -> !torch.vtensor<[64],i1> | |
%2692 = torch.aten.mul.Tensor %2689, %2691 : !torch.vtensor<[64],i1>, !torch.vtensor<[64],i1> -> !torch.vtensor<[64],i1> | |
%2693 = torch.aten.where.self %2692, %2687, %2678 : !torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%2694 = torch.prim.ListConstruct %2693, %2693 : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.list<vtensor> | |
%int-1_2112 = torch.constant.int -1 | |
%2695 = torch.aten.cat %2694, %int-1_2112 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[128],f32> | |
%int6_2113 = torch.constant.int 6 | |
%2696 = torch.prims.convert_element_type %2695, %int6_2113 : !torch.vtensor<[128],f32>, !torch.int -> !torch.vtensor<[128],f32> | |
%int1_2114 = torch.constant.int 1 | |
%2697 = torch.aten.unsqueeze %2667, %int1_2114 : !torch.vtensor<[131072],si64>, !torch.int -> !torch.vtensor<[131072,1],si64> | |
%int6_2115 = torch.constant.int 6 | |
%2698 = torch.prims.convert_element_type %2697, %int6_2115 : !torch.vtensor<[131072,1],si64>, !torch.int -> !torch.vtensor<[131072,1],f32> | |
%int0_2116 = torch.constant.int 0 | |
%2699 = torch.aten.unsqueeze %2696, %int0_2116 : !torch.vtensor<[128],f32>, !torch.int -> !torch.vtensor<[1,128],f32> | |
%int6_2117 = torch.constant.int 6 | |
%2700 = torch.prims.convert_element_type %2699, %int6_2117 : !torch.vtensor<[1,128],f32>, !torch.int -> !torch.vtensor<[1,128],f32> | |
%2701 = torch.aten.mul.Tensor %2698, %2700 : !torch.vtensor<[131072,1],f32>, !torch.vtensor<[1,128],f32> -> !torch.vtensor<[131072,128],f32> | |
%2702 = torch.aten.cos %2701 : !torch.vtensor<[131072,128],f32> -> !torch.vtensor<[131072,128],f32> | |
%int15_2118 = torch.constant.int 15 | |
%2703 = torch.prims.convert_element_type %2702, %int15_2118 : !torch.vtensor<[131072,128],f32>, !torch.int -> !torch.vtensor<[131072,128],bf16> | |
%2704 = torch.aten.sin %2701 : !torch.vtensor<[131072,128],f32> -> !torch.vtensor<[131072,128],f32> | |
%int15_2119 = torch.constant.int 15 | |
%2705 = torch.prims.convert_element_type %2704, %int15_2119 : !torch.vtensor<[131072,128],f32>, !torch.int -> !torch.vtensor<[131072,128],bf16> | |
%int0_2120 = torch.constant.int 0 | |
%int0_2121 = torch.constant.int 0 | |
%int1_2122 = torch.constant.int 1 | |
%2706 = torch.aten.slice.Tensor %2703, %int0_2120, %int0_2121, %777, %int1_2122 : !torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %2706, [%773], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int1_2123 = torch.constant.int 1 | |
%int0_2124 = torch.constant.int 0 | |
%int9223372036854775807_2125 = torch.constant.int 9223372036854775807 | |
%int1_2126 = torch.constant.int 1 | |
%2707 = torch.aten.slice.Tensor %2706, %int1_2123, %int0_2124, %int9223372036854775807_2125, %int1_2126 : !torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %2707, [%773], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int0_2127 = torch.constant.int 0 | |
%int0_2128 = torch.constant.int 0 | |
%int1_2129 = torch.constant.int 1 | |
%2708 = torch.aten.slice.Tensor %2705, %int0_2127, %int0_2128, %777, %int1_2129 : !torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %2708, [%773], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int1_2130 = torch.constant.int 1 | |
%int0_2131 = torch.constant.int 0 | |
%int9223372036854775807_2132 = torch.constant.int 9223372036854775807 | |
%int1_2133 = torch.constant.int 1 | |
%2709 = torch.aten.slice.Tensor %2708, %int1_2130, %int0_2131, %int9223372036854775807_2132, %int1_2133 : !torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %2709, [%773], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int0_2134 = torch.constant.int 0 | |
%2710 = torch.aten.unsqueeze %2707, %int0_2134 : !torch.vtensor<[?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %2710, [%773], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int1_2135 = torch.constant.int 1 | |
%int0_2136 = torch.constant.int 0 | |
%int9223372036854775807_2137 = torch.constant.int 9223372036854775807 | |
%int1_2138 = torch.constant.int 1 | |
%2711 = torch.aten.slice.Tensor %2710, %int1_2135, %int0_2136, %int9223372036854775807_2137, %int1_2138 : !torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %2711, [%773], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int2_2139 = torch.constant.int 2 | |
%2712 = torch.aten.unsqueeze %2711, %int2_2139 : !torch.vtensor<[1,?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %2712, [%773], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int3_2140 = torch.constant.int 3 | |
%int0_2141 = torch.constant.int 0 | |
%int9223372036854775807_2142 = torch.constant.int 9223372036854775807 | |
%int1_2143 = torch.constant.int 1 | |
%2713 = torch.aten.slice.Tensor %2712, %int3_2140, %int0_2141, %int9223372036854775807_2142, %int1_2143 : !torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %2713, [%773], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int4_2144 = torch.constant.int 4 | |
%int1_2145 = torch.constant.int 1 | |
%int1_2146 = torch.constant.int 1 | |
%int1_2147 = torch.constant.int 1 | |
%2714 = torch.prim.ListConstruct %int4_2144, %int1_2145, %int1_2146, %int1_2147 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2715 = torch.aten.repeat %2713, %2714 : !torch.vtensor<[1,?,1,128],bf16>, !torch.list<int> -> !torch.vtensor<[4,?,1,128],bf16> | |
torch.bind_symbolic_shape %2715, [%773], affine_map<()[s0] -> (4, s0 * 32, 1, 128)> : !torch.vtensor<[4,?,1,128],bf16> | |
%int0_2148 = torch.constant.int 0 | |
%2716 = torch.aten.unsqueeze %2709, %int0_2148 : !torch.vtensor<[?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %2716, [%773], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int1_2149 = torch.constant.int 1 | |
%int0_2150 = torch.constant.int 0 | |
%int9223372036854775807_2151 = torch.constant.int 9223372036854775807 | |
%int1_2152 = torch.constant.int 1 | |
%2717 = torch.aten.slice.Tensor %2716, %int1_2149, %int0_2150, %int9223372036854775807_2151, %int1_2152 : !torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %2717, [%773], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int2_2153 = torch.constant.int 2 | |
%2718 = torch.aten.unsqueeze %2717, %int2_2153 : !torch.vtensor<[1,?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %2718, [%773], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int3_2154 = torch.constant.int 3 | |
%int0_2155 = torch.constant.int 0 | |
%int9223372036854775807_2156 = torch.constant.int 9223372036854775807 | |
%int1_2157 = torch.constant.int 1 | |
%2719 = torch.aten.slice.Tensor %2718, %int3_2154, %int0_2155, %int9223372036854775807_2156, %int1_2157 : !torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %2719, [%773], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int4_2158 = torch.constant.int 4 | |
%int1_2159 = torch.constant.int 1 | |
%int1_2160 = torch.constant.int 1 | |
%int1_2161 = torch.constant.int 1 | |
%2720 = torch.prim.ListConstruct %int4_2158, %int1_2159, %int1_2160, %int1_2161 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2721 = torch.aten.repeat %2719, %2720 : !torch.vtensor<[1,?,1,128],bf16>, !torch.list<int> -> !torch.vtensor<[4,?,1,128],bf16> | |
torch.bind_symbolic_shape %2721, [%773], affine_map<()[s0] -> (4, s0 * 32, 1, 128)> : !torch.vtensor<[4,?,1,128],bf16> | |
%2722 = torch.aten.mul.Tensor %2601, %2715 : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16> -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2722, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 128)> : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
%int3_2162 = torch.constant.int 3 | |
%int0_2163 = torch.constant.int 0 | |
%int64_2164 = torch.constant.int 64 | |
%int1_2165 = torch.constant.int 1 | |
%2723 = torch.aten.slice.Tensor %2601, %int3_2162, %int0_2163, %int64_2164, %int1_2165 : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2723, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 64)> : !torch.vtensor<[4,?,8,64],f8E4M3FNUZ> | |
%int3_2166 = torch.constant.int 3 | |
%int64_2167 = torch.constant.int 64 | |
%int9223372036854775807_2168 = torch.constant.int 9223372036854775807 | |
%int1_2169 = torch.constant.int 1 | |
%2724 = torch.aten.slice.Tensor %2601, %int3_2166, %int64_2167, %int9223372036854775807_2168, %int1_2169 : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2724, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 64)> : !torch.vtensor<[4,?,8,64],f8E4M3FNUZ> | |
%2725 = torch.aten.neg %2724 : !torch.vtensor<[4,?,8,64],f8E4M3FNUZ> -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2725, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 64)> : !torch.vtensor<[4,?,8,64],f8E4M3FNUZ> | |
%2726 = torch.prim.ListConstruct %2725, %2723 : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>) -> !torch.list<vtensor> | |
%int-1_2170 = torch.constant.int -1 | |
%2727 = torch.aten.cat %2726, %int-1_2170 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2727, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 128)> : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
%2728 = torch.aten.mul.Tensor %2727, %2721 : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16> -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2728, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 128)> : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
%int1_2171 = torch.constant.int 1 | |
%2729 = torch.aten.add.Tensor %2722, %2728, %int1_2171 : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2729, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 128)> : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
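// Write the rotated K into the paged KV cache %2436 (viewed as [pages, 32, 2, 32, 8, 128]): the flat slot
// index is page_id (%arg2) * 64 + 10, which appears to select this layer's K slot within each page; the
// f8 data is bitcast to si8 so index_put can scatter it, then bitcast back and flattened to
// [pages, 2097152].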
%int64_2172 = torch.constant.int 64 | |
%2730 = torch.aten.mul.Scalar %arg2, %int64_2172 : !torch.vtensor<[4,?],si64>, !torch.int -> !torch.vtensor<[4,?],si64> | |
torch.bind_symbolic_shape %2730, [%773], affine_map<()[s0] -> (4, s0)> : !torch.vtensor<[4,?],si64> | |
%int10 = torch.constant.int 10 | |
%int1_2173 = torch.constant.int 1 | |
%2731 = torch.aten.add.Scalar %2730, %int10, %int1_2173 : !torch.vtensor<[4,?],si64>, !torch.int, !torch.int -> !torch.vtensor<[4,?],si64> | |
torch.bind_symbolic_shape %2731, [%773], affine_map<()[s0] -> (4, s0)> : !torch.vtensor<[4,?],si64> | |
%int4_2174 = torch.constant.int 4 | |
%int32_2175 = torch.constant.int 32 | |
%int8_2176 = torch.constant.int 8 | |
%int128_2177 = torch.constant.int 128 | |
%2732 = torch.prim.ListConstruct %int4_2174, %775, %int32_2175, %int8_2176, %int128_2177 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2733 = torch.aten.view %2729, %2732 : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2733, [%773], affine_map<()[s0] -> (4, s0, 32, 8, 128)> : !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ> | |
%int32_2178 = torch.constant.int 32 | |
%int8_2179 = torch.constant.int 8 | |
%int128_2180 = torch.constant.int 128 | |
%2734 = torch.prim.ListConstruct %997, %int32_2178, %int8_2179, %int128_2180 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2735 = torch.aten.view %2733, %2734 : !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2735, [%773], affine_map<()[s0] -> (s0 * 4, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%2736 = torch.prim.ListConstruct %997 : (!torch.int) -> !torch.list<int> | |
%2737 = torch.aten.view %2731, %2736 : !torch.vtensor<[4,?],si64>, !torch.list<int> -> !torch.vtensor<[?],si64> | |
torch.bind_symbolic_shape %2737, [%773], affine_map<()[s0] -> (s0 * 4)> : !torch.vtensor<[?],si64> | |
%int26_2181 = torch.constant.int 26 | |
%2738 = torch.prims.convert_element_type %2735, %int26_2181 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2738, [%773], affine_map<()[s0] -> (s0 * 4, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%int1_2182 = torch.constant.int 1 | |
%2739 = torch.aten.view.dtype %2738, %int1_2182 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %2739, [%773], affine_map<()[s0] -> (s0 * 4, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%2740 = torch.aten.detach %2739 : !torch.vtensor<[?,32,8,128],si8> -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %2740, [%773], affine_map<()[s0] -> (s0 * 4, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%2741 = torch.aten.detach %2740 : !torch.vtensor<[?,32,8,128],si8> -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %2741, [%773], affine_map<()[s0] -> (s0 * 4, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%int32_2183 = torch.constant.int 32 | |
%int2_2184 = torch.constant.int 2 | |
%int32_2185 = torch.constant.int 32 | |
%int8_2186 = torch.constant.int 8 | |
%int128_2187 = torch.constant.int 128 | |
%2742 = torch.prim.ListConstruct %776, %int32_2183, %int2_2184, %int32_2185, %int8_2186, %int128_2187 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2743 = torch.aten.view %2436, %2742 : !torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2743, [%774], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
%int32_2188 = torch.constant.int 32 | |
%int8_2189 = torch.constant.int 8 | |
%int128_2190 = torch.constant.int 128 | |
%2744 = torch.prim.ListConstruct %990, %int32_2188, %int8_2189, %int128_2190 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2745 = torch.aten.view %2743, %2744 : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2745, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%int1_2191 = torch.constant.int 1 | |
%2746 = torch.aten.view.dtype %2745, %int1_2191 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %2746, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%2747 = torch.aten.detach %2746 : !torch.vtensor<[?,32,8,128],si8> -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %2747, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%2748 = torch.aten.detach %2747 : !torch.vtensor<[?,32,8,128],si8> -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %2748, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%2749 = torch.prim.ListConstruct %2737 : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>> | |
%false_2192 = torch.constant.bool false | |
%2750 = torch.aten.index_put %2748, %2749, %2741, %false_2192 : !torch.vtensor<[?,32,8,128],si8>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],si8>, !torch.bool -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %2750, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%int26_2193 = torch.constant.int 26 | |
%2751 = torch.aten.view.dtype %2750, %int26_2193 : !torch.vtensor<[?,32,8,128],si8>, !torch.int -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2751, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%2752 = torch.aten.detach %2751 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2752, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%2753 = torch.aten.detach %2752 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2753, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%int32_2194 = torch.constant.int 32 | |
%int2_2195 = torch.constant.int 2 | |
%int32_2196 = torch.constant.int 32 | |
%int8_2197 = torch.constant.int 8 | |
%int128_2198 = torch.constant.int 128 | |
%2754 = torch.prim.ListConstruct %776, %int32_2194, %int2_2195, %int32_2196, %int8_2197, %int128_2198 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2755 = torch.aten.view %2753, %2754 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2755, [%774], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
%int2097152_2199 = torch.constant.int 2097152 | |
%2756 = torch.prim.ListConstruct %776, %int2097152_2199 : (!torch.int, !torch.int) -> !torch.list<int> | |
%2757 = torch.aten.view %2755, %2756 : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,2097152],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2757, [%774], affine_map<()[s0] -> (s0, 2097152)> : !torch.vtensor<[?,2097152],f8E4M3FNUZ> | |
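// Repeat the cache update for V: the slot index is the previous one plus 1 (page_id * 64 + 11), and the
// result is flattened back to [pages, 2097152] as the updated cache view.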
%int4_2200 = torch.constant.int 4 | |
%int32_2201 = torch.constant.int 32 | |
%int8_2202 = torch.constant.int 8 | |
%int128_2203 = torch.constant.int 128 | |
%2758 = torch.prim.ListConstruct %int4_2200, %775, %int32_2201, %int8_2202, %int128_2203 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2759 = torch.aten.view %2603, %2758 : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2759, [%773], affine_map<()[s0] -> (4, s0, 32, 8, 128)> : !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ> | |
%int32_2204 = torch.constant.int 32 | |
%int8_2205 = torch.constant.int 8 | |
%int128_2206 = torch.constant.int 128 | |
%2760 = torch.prim.ListConstruct %997, %int32_2204, %int8_2205, %int128_2206 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2761 = torch.aten.view %2759, %2760 : !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2761, [%773], affine_map<()[s0] -> (s0 * 4, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%int1_2207 = torch.constant.int 1 | |
%int1_2208 = torch.constant.int 1 | |
%2762 = torch.aten.add.Scalar %2731, %int1_2207, %int1_2208 : !torch.vtensor<[4,?],si64>, !torch.int, !torch.int -> !torch.vtensor<[4,?],si64> | |
torch.bind_symbolic_shape %2762, [%773], affine_map<()[s0] -> (4, s0)> : !torch.vtensor<[4,?],si64> | |
%2763 = torch.prim.ListConstruct %997 : (!torch.int) -> !torch.list<int> | |
%2764 = torch.aten.view %2762, %2763 : !torch.vtensor<[4,?],si64>, !torch.list<int> -> !torch.vtensor<[?],si64> | |
torch.bind_symbolic_shape %2764, [%773], affine_map<()[s0] -> (s0 * 4)> : !torch.vtensor<[?],si64> | |
%int26_2209 = torch.constant.int 26 | |
%2765 = torch.prims.convert_element_type %2761, %int26_2209 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2765, [%773], affine_map<()[s0] -> (s0 * 4, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%int1_2210 = torch.constant.int 1 | |
%2766 = torch.aten.view.dtype %2765, %int1_2210 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %2766, [%773], affine_map<()[s0] -> (s0 * 4, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%2767 = torch.aten.detach %2766 : !torch.vtensor<[?,32,8,128],si8> -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %2767, [%773], affine_map<()[s0] -> (s0 * 4, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%2768 = torch.aten.detach %2767 : !torch.vtensor<[?,32,8,128],si8> -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %2768, [%773], affine_map<()[s0] -> (s0 * 4, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%int32_2211 = torch.constant.int 32 | |
%int2_2212 = torch.constant.int 2 | |
%int32_2213 = torch.constant.int 32 | |
%int8_2214 = torch.constant.int 8 | |
%int128_2215 = torch.constant.int 128 | |
%2769 = torch.prim.ListConstruct %776, %int32_2211, %int2_2212, %int32_2213, %int8_2214, %int128_2215 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2770 = torch.aten.view %2757, %2769 : !torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2770, [%774], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
%int32_2216 = torch.constant.int 32 | |
%int8_2217 = torch.constant.int 8 | |
%int128_2218 = torch.constant.int 128 | |
%2771 = torch.prim.ListConstruct %990, %int32_2216, %int8_2217, %int128_2218 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2772 = torch.aten.view %2770, %2771 : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2772, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%int1_2219 = torch.constant.int 1 | |
%2773 = torch.aten.view.dtype %2772, %int1_2219 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %2773, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%2774 = torch.aten.detach %2773 : !torch.vtensor<[?,32,8,128],si8> -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %2774, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%2775 = torch.aten.detach %2774 : !torch.vtensor<[?,32,8,128],si8> -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %2775, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%2776 = torch.prim.ListConstruct %2764 : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>> | |
%false_2220 = torch.constant.bool false | |
%2777 = torch.aten.index_put %2775, %2776, %2768, %false_2220 : !torch.vtensor<[?,32,8,128],si8>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],si8>, !torch.bool -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %2777, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%int26_2221 = torch.constant.int 26 | |
%2778 = torch.aten.view.dtype %2777, %int26_2221 : !torch.vtensor<[?,32,8,128],si8>, !torch.int -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2778, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%2779 = torch.aten.detach %2778 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2779, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%2780 = torch.aten.detach %2779 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2780, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%int32_2222 = torch.constant.int 32 | |
%int2_2223 = torch.constant.int 2 | |
%int32_2224 = torch.constant.int 32 | |
%int8_2225 = torch.constant.int 8 | |
%int128_2226 = torch.constant.int 128 | |
%2781 = torch.prim.ListConstruct %776, %int32_2222, %int2_2223, %int32_2224, %int8_2225, %int128_2226 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2782 = torch.aten.view %2780, %2781 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2782, [%774], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
%int2097152_2227 = torch.constant.int 2097152 | |
%2783 = torch.prim.ListConstruct %776, %int2097152_2227 : (!torch.int, !torch.int) -> !torch.list<int> | |
%2784 = torch.aten.view %2782, %2783 : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,2097152],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2784, [%774], affine_map<()[s0] -> (s0, 2097152)> : !torch.vtensor<[?,2097152],f8E4M3FNUZ> | |
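// The updated cache is folded back to its flat [?, 2097152] storage layout. Next, the 8 KV heads of K and V are each
// broadcast 4x (unsqueeze / expand / clone / view) to match the 32 query heads, grouped-query-attention style.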
%int-2_2228 = torch.constant.int -2 | |
%2785 = torch.aten.unsqueeze %2729, %int-2_2228 : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2785, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 1, 128)> : !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ> | |
%int4_2229 = torch.constant.int 4 | |
%int8_2230 = torch.constant.int 8 | |
%int4_2231 = torch.constant.int 4 | |
%int128_2232 = torch.constant.int 128 | |
%2786 = torch.prim.ListConstruct %int4_2229, %777, %int8_2230, %int4_2231, %int128_2232 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_2233 = torch.constant.bool false | |
%2787 = torch.aten.expand %2785, %2786, %false_2233 : !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2787, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 4, 128)> : !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ> | |
%int0_2234 = torch.constant.int 0 | |
%2788 = torch.aten.clone %2787, %int0_2234 : !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2788, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 4, 128)> : !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ> | |
%int4_2235 = torch.constant.int 4 | |
%int32_2236 = torch.constant.int 32 | |
%int128_2237 = torch.constant.int 128 | |
%2789 = torch.prim.ListConstruct %int4_2235, %777, %int32_2236, %int128_2237 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2790 = torch.aten._unsafe_view %2788, %2789 : !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2790, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 128)> : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
%int-2_2238 = torch.constant.int -2 | |
%2791 = torch.aten.unsqueeze %2603, %int-2_2238 : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2791, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 1, 128)> : !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ> | |
%int4_2239 = torch.constant.int 4 | |
%int8_2240 = torch.constant.int 8 | |
%int4_2241 = torch.constant.int 4 | |
%int128_2242 = torch.constant.int 128 | |
%2792 = torch.prim.ListConstruct %int4_2239, %777, %int8_2240, %int4_2241, %int128_2242 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_2243 = torch.constant.bool false | |
%2793 = torch.aten.expand %2791, %2792, %false_2243 : !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2793, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 4, 128)> : !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ> | |
%int0_2244 = torch.constant.int 0 | |
%2794 = torch.aten.clone %2793, %int0_2244 : !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2794, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 4, 128)> : !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ> | |
%int4_2245 = torch.constant.int 4 | |
%int32_2246 = torch.constant.int 32 | |
%int128_2247 = torch.constant.int 128 | |
%2795 = torch.prim.ListConstruct %int4_2245, %777, %int32_2246, %int128_2247 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2796 = torch.aten._unsafe_view %2794, %2795 : !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2796, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 128)> : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
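// Transpose Q (%2666), the expanded K (%2790) and V (%2796) from [4, seq, 32, 128] to [4, 32, seq, 128] so that
// heads become the batch-like dimension expected by the attention kernel.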
%int1_2248 = torch.constant.int 1 | |
%int2_2249 = torch.constant.int 2 | |
%2797 = torch.aten.transpose.int %2666, %int1_2248, %int2_2249 : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2797, [%773], affine_map<()[s0] -> (4, 32, s0 * 32, 128)> : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
%int1_2250 = torch.constant.int 1 | |
%int2_2251 = torch.constant.int 2 | |
%2798 = torch.aten.transpose.int %2790, %int1_2250, %int2_2251 : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2798, [%773], affine_map<()[s0] -> (4, 32, s0 * 32, 128)> : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
%int1_2252 = torch.constant.int 1 | |
%int2_2253 = torch.constant.int 2 | |
%2799 = torch.aten.transpose.int %2796, %int1_2252, %int2_2253 : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2799, [%773], affine_map<()[s0] -> (4, 32, s0 * 32, 128)> : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
%int26_2254 = torch.constant.int 26 | |
%2800 = torch.prims.convert_element_type %2797, %int26_2254 : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2800, [%773], affine_map<()[s0] -> (4, 32, s0 * 32, 128)> : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
%int26_2255 = torch.constant.int 26 | |
%2801 = torch.prims.convert_element_type %2798, %int26_2255 : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2801, [%773], affine_map<()[s0] -> (4, 32, s0 * 32, 128)> : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
%int26_2256 = torch.constant.int 26 | |
%2802 = torch.prims.convert_element_type %2799, %int26_2256 : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2802, [%773], affine_map<()[s0] -> (4, 32, s0 * 32, 128)> : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
%int26_2257 = torch.constant.int 26 | |
%2803 = torch.prims.convert_element_type %803, %int26_2257 : !torch.vtensor<[4,1,?,?],f32>, !torch.int -> !torch.vtensor<[4,1,?,?],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2803, [%773], affine_map<()[s0] -> (4, 1, s0 * 32, s0 * 32)> : !torch.vtensor<[4,1,?,?],f8E4M3FNUZ> | |
%int0_2258 = torch.constant.int 0 | |
%int0_2259 = torch.constant.int 0 | |
%2804 = torch.aten.select.int %2803, %int0_2258, %int0_2259 : !torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[1,?,?],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2804, [%773], affine_map<()[s0] -> (1, s0 * 32, s0 * 32)> : !torch.vtensor<[1,?,?],f8E4M3FNUZ> | |
%int0_2260 = torch.constant.int 0 | |
%int0_2261 = torch.constant.int 0 | |
%2805 = torch.aten.select.int %2804, %int0_2260, %int0_2261 : !torch.vtensor<[1,?,?],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[?,?],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2805, [%773], affine_map<()[s0] -> (s0 * 32, s0 * 32)> : !torch.vtensor<[?,?],f8E4M3FNUZ> | |
%int0_2262 = torch.constant.int 0 | |
%int0_2263 = torch.constant.int 0 | |
%int9223372036854775807_2264 = torch.constant.int 9223372036854775807 | |
%int1_2265 = torch.constant.int 1 | |
%2806 = torch.aten.slice.Tensor %2805, %int0_2262, %int0_2263, %int9223372036854775807_2264, %int1_2265 : !torch.vtensor<[?,?],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,?],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2806, [%773], affine_map<()[s0] -> (s0 * 32, s0 * 32)> : !torch.vtensor<[?,?],f8E4M3FNUZ> | |
%int1_2266 = torch.constant.int 1 | |
%int0_2267 = torch.constant.int 0 | |
%int9223372036854775807_2268 = torch.constant.int 9223372036854775807 | |
%int1_2269 = torch.constant.int 1 | |
%2807 = torch.aten.slice.Tensor %2806, %int1_2266, %int0_2267, %int9223372036854775807_2268, %int1_2269 : !torch.vtensor<[?,?],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,?],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2807, [%773], affine_map<()[s0] -> (s0 * 32, s0 * 32)> : !torch.vtensor<[?,?],f8E4M3FNUZ> | |
%none_2270 = torch.constant.none | |
%2808 = torch.aten.clone %131, %none_2270 : !torch.vtensor<[],f32>, !torch.none -> !torch.vtensor<[],f32> | |
%2809 = torch.aten.detach %2808 : !torch.vtensor<[],f32> -> !torch.vtensor<[],f32> | |
%2810 = torch.aten.detach %2809 : !torch.vtensor<[],f32> -> !torch.vtensor<[],f32> | |
%2811 = torch.aten.detach %2810 : !torch.vtensor<[],f32> -> !torch.vtensor<[],f32> | |
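// Q, K, V, the 2-D f8 mask and the scalar scale (%131) are lowered to builtin tensors and handed to the sharktank
// masked flash-attention microkernel below; the fused kernel returns the attention output in f32.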
%2812 = torch_c.to_builtin_tensor %2800 : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> -> tensor<4x32x?x128xf8E4M3FNUZ> | |
%2813 = torch_c.to_builtin_tensor %2801 : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> -> tensor<4x32x?x128xf8E4M3FNUZ> | |
%2814 = torch_c.to_builtin_tensor %2802 : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> -> tensor<4x32x?x128xf8E4M3FNUZ> | |
%2815 = torch_c.to_builtin_tensor %2807 : !torch.vtensor<[?,?],f8E4M3FNUZ> -> tensor<?x?xf8E4M3FNUZ> | |
%2816 = torch_c.to_builtin_tensor %2811 : !torch.vtensor<[],f32> -> tensor<f32> | |
%2817 = util.call @sharktank_masked_flash_attention_4_32_128_128_f8E4M3FNUZ_f8E4M3FNUZ_f32_f32(%2812, %2813, %2814, %2816, %2815) : (tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<f32>, tensor<?x?xf8E4M3FNUZ>) -> tensor<4x32x?x128xf32> | |
%2818 = torch_c.from_builtin_tensor %2817 : tensor<4x32x?x128xf32> -> !torch.vtensor<[4,32,?,128],f32> | |
torch.bind_symbolic_shape %2818, [%773], affine_map<()[s0] -> (4, 32, s0 * 32, 128)> : !torch.vtensor<[4,32,?,128],f32> | |
%int1_2271 = torch.constant.int 1 | |
%int2_2272 = torch.constant.int 2 | |
%2819 = torch.aten.transpose.int %2818, %int1_2271, %int2_2272 : !torch.vtensor<[4,32,?,128],f32>, !torch.int, !torch.int -> !torch.vtensor<[4,?,32,128],f32> | |
torch.bind_symbolic_shape %2819, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 128)> : !torch.vtensor<[4,?,32,128],f32> | |
%int0_2273 = torch.constant.int 0 | |
%2820 = torch.aten.clone %2819, %int0_2273 : !torch.vtensor<[4,?,32,128],f32>, !torch.int -> !torch.vtensor<[4,?,32,128],f32> | |
torch.bind_symbolic_shape %2820, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 128)> : !torch.vtensor<[4,?,32,128],f32> | |
%int4_2274 = torch.constant.int 4 | |
%int4096_2275 = torch.constant.int 4096 | |
%2821 = torch.prim.ListConstruct %int4_2274, %777, %int4096_2275 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2822 = torch.aten._unsafe_view %2820, %2821 : !torch.vtensor<[4,?,32,128],f32>, !torch.list<int> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2822, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
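// Quantize the attention output for the attn_output projection: divide by the per-tensor input rscale (%132), clamp
// to [-240, 240] (the finite range of f8E4M3FNUZ) and cast to f8, then run a batched transpose-B matmul against the
// weight expanded to [4, 4096, 4096].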
%2823 = torch.aten.div.Tensor %2822, %132 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2823, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%float-2.400000e02_2276 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_2277 = torch.constant.float 2.400000e+02 | |
%2824 = torch.aten.clamp %2823, %float-2.400000e02_2276, %float2.400000e02_2277 : !torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2824, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int26_2278 = torch.constant.int 26 | |
%2825 = torch.prims.convert_element_type %2824, %int26_2278 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2825, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
%int0_2279 = torch.constant.int 0 | |
%2826 = torch.aten.unsqueeze %133, %int0_2279 : !torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ> | |
%int4_2280 = torch.constant.int 4 | |
%int4096_2281 = torch.constant.int 4096 | |
%int4096_2282 = torch.constant.int 4096 | |
%2827 = torch.prim.ListConstruct %int4_2280, %int4096_2281, %int4096_2282 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_2283 = torch.constant.bool false | |
%2828 = torch.aten.expand %2826, %2827, %false_2283 : !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ> | |
%2829 = torch_c.to_builtin_tensor %2825 : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> -> tensor<4x?x4096xf8E4M3FNUZ> | |
%2830 = torch_c.to_builtin_tensor %2828 : !torch.vtensor<[4,4096,4096],f8E4M3FNUZ> -> tensor<4x4096x4096xf8E4M3FNUZ> | |
%2831 = util.call @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ_f32(%2829, %2830) : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x?x4096xf32> | |
%2832 = torch_c.from_builtin_tensor %2831 : tensor<4x?x4096xf32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2832, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%2833 = torch.prim.ListConstruct : () -> !torch.list<int> | |
%2834 = torch.aten.permute %134, %2833 : !torch.vtensor<[],f32>, !torch.list<int> -> !torch.vtensor<[],f32> | |
%2835 = torch.aten.mul.Tensor %132, %2834 : !torch.vtensor<[],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[],f32> | |
%int6_2284 = torch.constant.int 6 | |
%2836 = torch.prims.convert_element_type %2832, %int6_2284 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2836, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%2837 = torch.aten.mul.Tensor %2836, %2835 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2837, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int1_2285 = torch.constant.int 1 | |
%2838 = torch.aten.add.Tensor %2548, %2837, %int1_2285 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2838, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
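// Residual stream updated (%2838 = %2548 + rescaled projection). The ops below form an RMSNorm:
// x * rsqrt(mean(x^2, dim=-1) + 1e-5), scaled by the bf16 ffn_norm weight (%135).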
%int6_2286 = torch.constant.int 6 | |
%2839 = torch.prims.convert_element_type %2838, %int6_2286 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2839, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int2_2287 = torch.constant.int 2 | |
%2840 = torch.aten.pow.Tensor_Scalar %2839, %int2_2287 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2840, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int-1_2288 = torch.constant.int -1 | |
%2841 = torch.prim.ListConstruct %int-1_2288 : (!torch.int) -> !torch.list<int> | |
%true_2289 = torch.constant.bool true | |
%none_2290 = torch.constant.none | |
%2842 = torch.aten.mean.dim %2840, %2841, %true_2289, %none_2290 : !torch.vtensor<[4,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[4,?,1],f32> | |
torch.bind_symbolic_shape %2842, [%773], affine_map<()[s0] -> (4, s0 * 32, 1)> : !torch.vtensor<[4,?,1],f32> | |
%float1.000000e-05_2291 = torch.constant.float 1.000000e-05 | |
%int1_2292 = torch.constant.int 1 | |
%2843 = torch.aten.add.Scalar %2842, %float1.000000e-05_2291, %int1_2292 : !torch.vtensor<[4,?,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[4,?,1],f32> | |
torch.bind_symbolic_shape %2843, [%773], affine_map<()[s0] -> (4, s0 * 32, 1)> : !torch.vtensor<[4,?,1],f32> | |
%2844 = torch.aten.rsqrt %2843 : !torch.vtensor<[4,?,1],f32> -> !torch.vtensor<[4,?,1],f32> | |
torch.bind_symbolic_shape %2844, [%773], affine_map<()[s0] -> (4, s0 * 32, 1)> : !torch.vtensor<[4,?,1],f32> | |
%2845 = torch.aten.mul.Tensor %2839, %2844 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[4,?,1],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2845, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int6_2293 = torch.constant.int 6 | |
%2846 = torch.prims.convert_element_type %2845, %int6_2293 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2846, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%2847 = torch.aten.mul.Tensor %135, %2846 : !torch.vtensor<[4096],bf16>, !torch.vtensor<[4,?,4096],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2847, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int6_2294 = torch.constant.int 6 | |
%2848 = torch.prims.convert_element_type %2847, %int6_2294 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2848, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
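// SwiGLU feed-forward, gate branch: quantize the normed activations (divide by %136, clamp to +/-240, cast to
// f8E4M3FNUZ), project through the 14336x4096 gate weight, rescale the f32 result, and apply SiLU.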
%2849 = torch.aten.div.Tensor %2848, %136 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2849, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%float-2.400000e02_2295 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_2296 = torch.constant.float 2.400000e+02 | |
%2850 = torch.aten.clamp %2849, %float-2.400000e02_2295, %float2.400000e02_2296 : !torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2850, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int26_2297 = torch.constant.int 26 | |
%2851 = torch.prims.convert_element_type %2850, %int26_2297 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2851, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
%int0_2298 = torch.constant.int 0 | |
%2852 = torch.aten.unsqueeze %137, %int0_2298 : !torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ> | |
%int4_2299 = torch.constant.int 4 | |
%int14336_2300 = torch.constant.int 14336 | |
%int4096_2301 = torch.constant.int 4096 | |
%2853 = torch.prim.ListConstruct %int4_2299, %int14336_2300, %int4096_2301 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_2302 = torch.constant.bool false | |
%2854 = torch.aten.expand %2852, %2853, %false_2302 : !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ> | |
%2855 = torch_c.to_builtin_tensor %2851 : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> -> tensor<4x?x4096xf8E4M3FNUZ> | |
%2856 = torch_c.to_builtin_tensor %2854 : !torch.vtensor<[4,14336,4096],f8E4M3FNUZ> -> tensor<4x14336x4096xf8E4M3FNUZ> | |
%2857 = util.call @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ_f32(%2855, %2856) : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x?x14336xf32> | |
%2858 = torch_c.from_builtin_tensor %2857 : tensor<4x?x14336xf32> -> !torch.vtensor<[4,?,14336],f32> | |
torch.bind_symbolic_shape %2858, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f32> | |
%2859 = torch.prim.ListConstruct : () -> !torch.list<int> | |
%2860 = torch.aten.permute %138, %2859 : !torch.vtensor<[],f32>, !torch.list<int> -> !torch.vtensor<[],f32> | |
%2861 = torch.aten.mul.Tensor %136, %2860 : !torch.vtensor<[],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[],f32> | |
%int6_2303 = torch.constant.int 6 | |
%2862 = torch.prims.convert_element_type %2858, %int6_2303 : !torch.vtensor<[4,?,14336],f32>, !torch.int -> !torch.vtensor<[4,?,14336],f32> | |
torch.bind_symbolic_shape %2862, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f32> | |
%2863 = torch.aten.mul.Tensor %2862, %2861 : !torch.vtensor<[4,?,14336],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,14336],f32> | |
torch.bind_symbolic_shape %2863, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f32> | |
%2864 = torch.aten.silu %2863 : !torch.vtensor<[4,?,14336],f32> -> !torch.vtensor<[4,?,14336],f32> | |
torch.bind_symbolic_shape %2864, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f32> | |
%2865 = torch.aten.div.Tensor %2848, %139 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2865, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%float-2.400000e02_2304 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_2305 = torch.constant.float 2.400000e+02 | |
%2866 = torch.aten.clamp %2865, %float-2.400000e02_2304, %float2.400000e02_2305 : !torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2866, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int26_2306 = torch.constant.int 26 | |
%2867 = torch.prims.convert_element_type %2866, %int26_2306 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2867, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
%int0_2307 = torch.constant.int 0 | |
%2868 = torch.aten.unsqueeze %140, %int0_2307 : !torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ> | |
%int4_2308 = torch.constant.int 4 | |
%int14336_2309 = torch.constant.int 14336 | |
%int4096_2310 = torch.constant.int 4096 | |
%2869 = torch.prim.ListConstruct %int4_2308, %int14336_2309, %int4096_2310 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_2311 = torch.constant.bool false | |
%2870 = torch.aten.expand %2868, %2869, %false_2311 : !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ> | |
%2871 = torch_c.to_builtin_tensor %2867 : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> -> tensor<4x?x4096xf8E4M3FNUZ> | |
%2872 = torch_c.to_builtin_tensor %2870 : !torch.vtensor<[4,14336,4096],f8E4M3FNUZ> -> tensor<4x14336x4096xf8E4M3FNUZ> | |
%2873 = util.call @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ_f32(%2871, %2872) : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x?x14336xf32> | |
%2874 = torch_c.from_builtin_tensor %2873 : tensor<4x?x14336xf32> -> !torch.vtensor<[4,?,14336],f32> | |
torch.bind_symbolic_shape %2874, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f32> | |
%2875 = torch.prim.ListConstruct : () -> !torch.list<int> | |
%2876 = torch.aten.permute %141, %2875 : !torch.vtensor<[],f32>, !torch.list<int> -> !torch.vtensor<[],f32> | |
%2877 = torch.aten.mul.Tensor %139, %2876 : !torch.vtensor<[],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[],f32> | |
%int6_2312 = torch.constant.int 6 | |
%2878 = torch.prims.convert_element_type %2874, %int6_2312 : !torch.vtensor<[4,?,14336],f32>, !torch.int -> !torch.vtensor<[4,?,14336],f32> | |
torch.bind_symbolic_shape %2878, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f32> | |
%2879 = torch.aten.mul.Tensor %2878, %2877 : !torch.vtensor<[4,?,14336],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,14336],f32> | |
torch.bind_symbolic_shape %2879, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f32> | |
%2880 = torch.aten.mul.Tensor %2864, %2879 : !torch.vtensor<[4,?,14336],f32>, !torch.vtensor<[4,?,14336],f32> -> !torch.vtensor<[4,?,14336],f32> | |
torch.bind_symbolic_shape %2880, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f32> | |
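// %2880 is the SwiGLU product silu(gate) * up. It is quantized once more and projected back to the model width
// through the 4096x14336 down-projection weight, rescaled, and added to the residual stream.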
%2881 = torch.aten.div.Tensor %2880, %142 : !torch.vtensor<[4,?,14336],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,14336],f32> | |
torch.bind_symbolic_shape %2881, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f32> | |
%float-2.400000e02_2313 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_2314 = torch.constant.float 2.400000e+02 | |
%2882 = torch.aten.clamp %2881, %float-2.400000e02_2313, %float2.400000e02_2314 : !torch.vtensor<[4,?,14336],f32>, !torch.float, !torch.float -> !torch.vtensor<[4,?,14336],f32> | |
torch.bind_symbolic_shape %2882, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f32> | |
%int26_2315 = torch.constant.int 26 | |
%2883 = torch.prims.convert_element_type %2882, %int26_2315 : !torch.vtensor<[4,?,14336],f32>, !torch.int -> !torch.vtensor<[4,?,14336],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2883, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f8E4M3FNUZ> | |
%int0_2316 = torch.constant.int 0 | |
%2884 = torch.aten.unsqueeze %143, %int0_2316 : !torch.vtensor<[4096,14336],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,4096,14336],f8E4M3FNUZ> | |
%int4_2317 = torch.constant.int 4 | |
%int4096_2318 = torch.constant.int 4096 | |
%int14336_2319 = torch.constant.int 14336 | |
%2885 = torch.prim.ListConstruct %int4_2317, %int4096_2318, %int14336_2319 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_2320 = torch.constant.bool false | |
%2886 = torch.aten.expand %2884, %2885, %false_2320 : !torch.vtensor<[1,4096,14336],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[4,4096,14336],f8E4M3FNUZ> | |
%2887 = torch_c.to_builtin_tensor %2883 : !torch.vtensor<[4,?,14336],f8E4M3FNUZ> -> tensor<4x?x14336xf8E4M3FNUZ> | |
%2888 = torch_c.to_builtin_tensor %2886 : !torch.vtensor<[4,4096,14336],f8E4M3FNUZ> -> tensor<4x4096x14336xf8E4M3FNUZ> | |
%2889 = util.call @sharktank_batch_matmul_transpose_b_L4xDx14336xf8E4M3FNUZ_R4x4096x14336xf8E4M3FNUZ_f32(%2887, %2888) : (tensor<4x?x14336xf8E4M3FNUZ>, tensor<4x4096x14336xf8E4M3FNUZ>) -> tensor<4x?x4096xf32> | |
%2890 = torch_c.from_builtin_tensor %2889 : tensor<4x?x4096xf32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2890, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%2891 = torch.prim.ListConstruct : () -> !torch.list<int> | |
%2892 = torch.aten.permute %144, %2891 : !torch.vtensor<[],f32>, !torch.list<int> -> !torch.vtensor<[],f32> | |
%2893 = torch.aten.mul.Tensor %142, %2892 : !torch.vtensor<[],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[],f32> | |
%int6_2321 = torch.constant.int 6 | |
%2894 = torch.prims.convert_element_type %2890, %int6_2321 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2894, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%2895 = torch.aten.mul.Tensor %2894, %2893 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2895, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int1_2322 = torch.constant.int 1 | |
%2896 = torch.aten.add.Tensor %2838, %2895, %int1_2322 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2896, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
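// %2896 closes this decoder layer. What follows appears to be the next layer's attention RMSNorm (weight %145)
// followed by its quantized Q/K/V projections.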
%int6_2323 = torch.constant.int 6 | |
%2897 = torch.prims.convert_element_type %2896, %int6_2323 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2897, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int2_2324 = torch.constant.int 2 | |
%2898 = torch.aten.pow.Tensor_Scalar %2897, %int2_2324 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2898, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int-1_2325 = torch.constant.int -1 | |
%2899 = torch.prim.ListConstruct %int-1_2325 : (!torch.int) -> !torch.list<int> | |
%true_2326 = torch.constant.bool true | |
%none_2327 = torch.constant.none | |
%2900 = torch.aten.mean.dim %2898, %2899, %true_2326, %none_2327 : !torch.vtensor<[4,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[4,?,1],f32> | |
torch.bind_symbolic_shape %2900, [%773], affine_map<()[s0] -> (4, s0 * 32, 1)> : !torch.vtensor<[4,?,1],f32> | |
%float1.000000e-05_2328 = torch.constant.float 1.000000e-05 | |
%int1_2329 = torch.constant.int 1 | |
%2901 = torch.aten.add.Scalar %2900, %float1.000000e-05_2328, %int1_2329 : !torch.vtensor<[4,?,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[4,?,1],f32> | |
torch.bind_symbolic_shape %2901, [%773], affine_map<()[s0] -> (4, s0 * 32, 1)> : !torch.vtensor<[4,?,1],f32> | |
%2902 = torch.aten.rsqrt %2901 : !torch.vtensor<[4,?,1],f32> -> !torch.vtensor<[4,?,1],f32> | |
torch.bind_symbolic_shape %2902, [%773], affine_map<()[s0] -> (4, s0 * 32, 1)> : !torch.vtensor<[4,?,1],f32> | |
%2903 = torch.aten.mul.Tensor %2897, %2902 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[4,?,1],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2903, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int6_2330 = torch.constant.int 6 | |
%2904 = torch.prims.convert_element_type %2903, %int6_2330 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2904, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%2905 = torch.aten.mul.Tensor %145, %2904 : !torch.vtensor<[4096],bf16>, !torch.vtensor<[4,?,4096],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2905, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int6_2331 = torch.constant.int 6 | |
%2906 = torch.prims.convert_element_type %2905, %int6_2331 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2906, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
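// Q, K and V projections: each path quantizes the normed input (divide by its rscale, clamp to +/-240, cast to
// f8E4M3FNUZ), runs a batched transpose-B matmul against the expanded weight (4096x4096 for Q, 1024x4096 for K and
// V), and re-quantizes the f32 result back to f8.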
%2907 = torch.aten.div.Tensor %2906, %146 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2907, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%float-2.400000e02_2332 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_2333 = torch.constant.float 2.400000e+02 | |
%2908 = torch.aten.clamp %2907, %float-2.400000e02_2332, %float2.400000e02_2333 : !torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2908, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int26_2334 = torch.constant.int 26 | |
%2909 = torch.prims.convert_element_type %2908, %int26_2334 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2909, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
%int0_2335 = torch.constant.int 0 | |
%2910 = torch.aten.unsqueeze %147, %int0_2335 : !torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ> | |
%int4_2336 = torch.constant.int 4 | |
%int4096_2337 = torch.constant.int 4096 | |
%int4096_2338 = torch.constant.int 4096 | |
%2911 = torch.prim.ListConstruct %int4_2336, %int4096_2337, %int4096_2338 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_2339 = torch.constant.bool false | |
%2912 = torch.aten.expand %2910, %2911, %false_2339 : !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ> | |
%2913 = torch_c.to_builtin_tensor %2909 : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> -> tensor<4x?x4096xf8E4M3FNUZ> | |
%2914 = torch_c.to_builtin_tensor %2912 : !torch.vtensor<[4,4096,4096],f8E4M3FNUZ> -> tensor<4x4096x4096xf8E4M3FNUZ> | |
%2915 = util.call @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ_f32(%2913, %2914) : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x?x4096xf32> | |
%2916 = torch_c.from_builtin_tensor %2915 : tensor<4x?x4096xf32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2916, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%2917 = torch.aten.div.Tensor %2916, %148 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2917, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%float-2.400000e02_2340 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_2341 = torch.constant.float 2.400000e+02 | |
%2918 = torch.aten.clamp %2917, %float-2.400000e02_2340, %float2.400000e02_2341 : !torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2918, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int26_2342 = torch.constant.int 26 | |
%2919 = torch.prims.convert_element_type %2918, %int26_2342 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2919, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
%2920 = torch.aten.div.Tensor %2906, %149 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2920, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%float-2.400000e02_2343 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_2344 = torch.constant.float 2.400000e+02 | |
%2921 = torch.aten.clamp %2920, %float-2.400000e02_2343, %float2.400000e02_2344 : !torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2921, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int26_2345 = torch.constant.int 26 | |
%2922 = torch.prims.convert_element_type %2921, %int26_2345 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2922, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
%int0_2346 = torch.constant.int 0 | |
%2923 = torch.aten.unsqueeze %150, %int0_2346 : !torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ> | |
%int4_2347 = torch.constant.int 4 | |
%int1024_2348 = torch.constant.int 1024 | |
%int4096_2349 = torch.constant.int 4096 | |
%2924 = torch.prim.ListConstruct %int4_2347, %int1024_2348, %int4096_2349 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_2350 = torch.constant.bool false | |
%2925 = torch.aten.expand %2923, %2924, %false_2350 : !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ> | |
%2926 = torch_c.to_builtin_tensor %2922 : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> -> tensor<4x?x4096xf8E4M3FNUZ> | |
%2927 = torch_c.to_builtin_tensor %2925 : !torch.vtensor<[4,1024,4096],f8E4M3FNUZ> -> tensor<4x1024x4096xf8E4M3FNUZ> | |
%2928 = util.call @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ_f32(%2926, %2927) : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x?x1024xf32> | |
%2929 = torch_c.from_builtin_tensor %2928 : tensor<4x?x1024xf32> -> !torch.vtensor<[4,?,1024],f32> | |
torch.bind_symbolic_shape %2929, [%773], affine_map<()[s0] -> (4, s0 * 32, 1024)> : !torch.vtensor<[4,?,1024],f32> | |
%2930 = torch.aten.div.Tensor %2929, %151 : !torch.vtensor<[4,?,1024],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,1024],f32> | |
torch.bind_symbolic_shape %2930, [%773], affine_map<()[s0] -> (4, s0 * 32, 1024)> : !torch.vtensor<[4,?,1024],f32> | |
%float-2.400000e02_2351 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_2352 = torch.constant.float 2.400000e+02 | |
%2931 = torch.aten.clamp %2930, %float-2.400000e02_2351, %float2.400000e02_2352 : !torch.vtensor<[4,?,1024],f32>, !torch.float, !torch.float -> !torch.vtensor<[4,?,1024],f32> | |
torch.bind_symbolic_shape %2931, [%773], affine_map<()[s0] -> (4, s0 * 32, 1024)> : !torch.vtensor<[4,?,1024],f32> | |
%int26_2353 = torch.constant.int 26 | |
%2932 = torch.prims.convert_element_type %2931, %int26_2353 : !torch.vtensor<[4,?,1024],f32>, !torch.int -> !torch.vtensor<[4,?,1024],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2932, [%773], affine_map<()[s0] -> (4, s0 * 32, 1024)> : !torch.vtensor<[4,?,1024],f8E4M3FNUZ> | |
%2933 = torch.aten.div.Tensor %2906, %152 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2933, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%float-2.400000e02_2354 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_2355 = torch.constant.float 2.400000e+02 | |
%2934 = torch.aten.clamp %2933, %float-2.400000e02_2354, %float2.400000e02_2355 : !torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2934, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int26_2356 = torch.constant.int 26 | |
%2935 = torch.prims.convert_element_type %2934, %int26_2356 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2935, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
%int0_2357 = torch.constant.int 0 | |
%2936 = torch.aten.unsqueeze %153, %int0_2357 : !torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ> | |
%int4_2358 = torch.constant.int 4 | |
%int1024_2359 = torch.constant.int 1024 | |
%int4096_2360 = torch.constant.int 4096 | |
%2937 = torch.prim.ListConstruct %int4_2358, %int1024_2359, %int4096_2360 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_2361 = torch.constant.bool false | |
%2938 = torch.aten.expand %2936, %2937, %false_2361 : !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ> | |
%2939 = torch_c.to_builtin_tensor %2935 : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> -> tensor<4x?x4096xf8E4M3FNUZ> | |
%2940 = torch_c.to_builtin_tensor %2938 : !torch.vtensor<[4,1024,4096],f8E4M3FNUZ> -> tensor<4x1024x4096xf8E4M3FNUZ> | |
%2941 = util.call @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ_f32(%2939, %2940) : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x?x1024xf32> | |
%2942 = torch_c.from_builtin_tensor %2941 : tensor<4x?x1024xf32> -> !torch.vtensor<[4,?,1024],f32> | |
torch.bind_symbolic_shape %2942, [%773], affine_map<()[s0] -> (4, s0 * 32, 1024)> : !torch.vtensor<[4,?,1024],f32> | |
%2943 = torch.aten.div.Tensor %2942, %154 : !torch.vtensor<[4,?,1024],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,1024],f32> | |
torch.bind_symbolic_shape %2943, [%773], affine_map<()[s0] -> (4, s0 * 32, 1024)> : !torch.vtensor<[4,?,1024],f32> | |
%float-2.400000e02_2362 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_2363 = torch.constant.float 2.400000e+02 | |
%2944 = torch.aten.clamp %2943, %float-2.400000e02_2362, %float2.400000e02_2363 : !torch.vtensor<[4,?,1024],f32>, !torch.float, !torch.float -> !torch.vtensor<[4,?,1024],f32> | |
torch.bind_symbolic_shape %2944, [%773], affine_map<()[s0] -> (4, s0 * 32, 1024)> : !torch.vtensor<[4,?,1024],f32> | |
%int26_2364 = torch.constant.int 26 | |
%2945 = torch.prims.convert_element_type %2944, %int26_2364 : !torch.vtensor<[4,?,1024],f32>, !torch.int -> !torch.vtensor<[4,?,1024],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2945, [%773], affine_map<()[s0] -> (4, s0 * 32, 1024)> : !torch.vtensor<[4,?,1024],f8E4M3FNUZ> | |
%int4_2365 = torch.constant.int 4 | |
%int32_2366 = torch.constant.int 32 | |
%int128_2367 = torch.constant.int 128 | |
%2946 = torch.prim.ListConstruct %int4_2365, %777, %int32_2366, %int128_2367 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2947 = torch.aten.view %2919, %2946 : !torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2947, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 128)> : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
%int4_2368 = torch.constant.int 4 | |
%int8_2369 = torch.constant.int 8 | |
%int128_2370 = torch.constant.int 128 | |
%2948 = torch.prim.ListConstruct %int4_2368, %777, %int8_2369, %int128_2370 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2949 = torch.aten.view %2932, %2948 : !torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2949, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 128)> : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
%int4_2371 = torch.constant.int 4 | |
%int8_2372 = torch.constant.int 8 | |
%int128_2373 = torch.constant.int 128 | |
%2950 = torch.prim.ListConstruct %int4_2371, %777, %int8_2372, %int128_2373 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2951 = torch.aten.view %2945, %2950 : !torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2951, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 128)> : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
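// Rotary-embedding tables: positions 0..131071 and 64 even channel indices yield inverse frequencies with base
// 5.0e5; the where/smoothing arithmetic below appears to implement llama3-style context scaling (factor 8, original
// context 8192, wavelength cutoffs 2048 and 8192) before the cos/sin tables are materialized in bf16.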
%int131072_2374 = torch.constant.int 131072 | |
%none_2375 = torch.constant.none | |
%none_2376 = torch.constant.none | |
%cpu_2377 = torch.constant.device "cpu" | |
%false_2378 = torch.constant.bool false | |
%2952 = torch.aten.arange %int131072_2374, %none_2375, %none_2376, %cpu_2377, %false_2378 : !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[131072],si64> | |
%int0_2379 = torch.constant.int 0 | |
%int128_2380 = torch.constant.int 128 | |
%int2_2381 = torch.constant.int 2 | |
%int4_2382 = torch.constant.int 4 | |
%none_2383 = torch.constant.none | |
%cpu_2384 = torch.constant.device "cpu" | |
%false_2385 = torch.constant.bool false | |
%2953 = torch.aten.arange.start_step %int0_2379, %int128_2380, %int2_2381, %int4_2382, %none_2383, %cpu_2384, %false_2385 : !torch.int, !torch.int, !torch.int, !torch.int, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[64],si64> | |
%int6_2386 = torch.constant.int 6 | |
%2954 = torch.prims.convert_element_type %2953, %int6_2386 : !torch.vtensor<[64],si64>, !torch.int -> !torch.vtensor<[64],f32> | |
%int128_2387 = torch.constant.int 128 | |
%2955 = torch.aten.div.Scalar %2954, %int128_2387 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%float5.000000e05_2388 = torch.constant.float 5.000000e+05 | |
%2956 = torch.aten.pow.Scalar %float5.000000e05_2388, %2955 : !torch.float, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%2957 = torch.aten.reciprocal %2956 : !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%float1.000000e00_2389 = torch.constant.float 1.000000e+00 | |
%2958 = torch.aten.mul.Scalar %2957, %float1.000000e00_2389 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],f32> | |
%2959 = torch.aten.reciprocal %2958 : !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%float6.283190e00_2390 = torch.constant.float 6.2831853071795862 | |
%2960 = torch.aten.mul.Scalar %2959, %float6.283190e00_2390 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],f32> | |
%float8.192000e03_2391 = torch.constant.float 8.192000e+03 | |
%2961 = torch.aten.gt.Scalar %2960, %float8.192000e03_2391 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],i1> | |
%int8_2392 = torch.constant.int 8 | |
%2962 = torch.aten.div.Scalar %2958, %int8_2392 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%2963 = torch.aten.where.self %2961, %2962, %2958 : !torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%2964 = torch.aten.reciprocal %2960 : !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%int8192_2393 = torch.constant.int 8192 | |
%2965 = torch.aten.mul.Scalar %2964, %int8192_2393 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%int1_2394 = torch.constant.int 1 | |
%int1_2395 = torch.constant.int 1 | |
%2966 = torch.aten.sub.Scalar %2965, %int1_2394, %int1_2395 : !torch.vtensor<[64],f32>, !torch.int, !torch.int -> !torch.vtensor<[64],f32> | |
%int3_2396 = torch.constant.int 3 | |
%2967 = torch.aten.div.Scalar %2966, %int3_2396 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%int1_2397 = torch.constant.int 1 | |
%int1_2398 = torch.constant.int 1 | |
%2968 = torch.aten.rsub.Scalar %2967, %int1_2397, %int1_2398 : !torch.vtensor<[64],f32>, !torch.int, !torch.int -> !torch.vtensor<[64],f32> | |
%2969 = torch.aten.mul.Tensor %2968, %2963 : !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%int8_2399 = torch.constant.int 8 | |
%2970 = torch.aten.div.Scalar %2969, %int8_2399 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%2971 = torch.aten.mul.Tensor %2967, %2963 : !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%int1_2400 = torch.constant.int 1 | |
%2972 = torch.aten.add.Tensor %2970, %2971, %int1_2400 : !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%float2.048000e03_2401 = torch.constant.float 2.048000e+03 | |
%2973 = torch.aten.lt.Scalar %2960, %float2.048000e03_2401 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],i1> | |
%2974 = torch.aten.bitwise_not %2973 : !torch.vtensor<[64],i1> -> !torch.vtensor<[64],i1> | |
%float8.192000e03_2402 = torch.constant.float 8.192000e+03 | |
%2975 = torch.aten.gt.Scalar %2960, %float8.192000e03_2402 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],i1> | |
%2976 = torch.aten.bitwise_not %2975 : !torch.vtensor<[64],i1> -> !torch.vtensor<[64],i1> | |
%2977 = torch.aten.mul.Tensor %2974, %2976 : !torch.vtensor<[64],i1>, !torch.vtensor<[64],i1> -> !torch.vtensor<[64],i1> | |
%2978 = torch.aten.where.self %2977, %2972, %2963 : !torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%2979 = torch.prim.ListConstruct %2978, %2978 : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.list<vtensor> | |
%int-1_2403 = torch.constant.int -1 | |
%2980 = torch.aten.cat %2979, %int-1_2403 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[128],f32> | |
%int6_2404 = torch.constant.int 6 | |
%2981 = torch.prims.convert_element_type %2980, %int6_2404 : !torch.vtensor<[128],f32>, !torch.int -> !torch.vtensor<[128],f32> | |
%int1_2405 = torch.constant.int 1 | |
%2982 = torch.aten.unsqueeze %2952, %int1_2405 : !torch.vtensor<[131072],si64>, !torch.int -> !torch.vtensor<[131072,1],si64> | |
%int6_2406 = torch.constant.int 6 | |
%2983 = torch.prims.convert_element_type %2982, %int6_2406 : !torch.vtensor<[131072,1],si64>, !torch.int -> !torch.vtensor<[131072,1],f32> | |
%int0_2407 = torch.constant.int 0 | |
%2984 = torch.aten.unsqueeze %2981, %int0_2407 : !torch.vtensor<[128],f32>, !torch.int -> !torch.vtensor<[1,128],f32> | |
%int6_2408 = torch.constant.int 6 | |
%2985 = torch.prims.convert_element_type %2984, %int6_2408 : !torch.vtensor<[1,128],f32>, !torch.int -> !torch.vtensor<[1,128],f32> | |
%2986 = torch.aten.mul.Tensor %2983, %2985 : !torch.vtensor<[131072,1],f32>, !torch.vtensor<[1,128],f32> -> !torch.vtensor<[131072,128],f32> | |
%2987 = torch.aten.cos %2986 : !torch.vtensor<[131072,128],f32> -> !torch.vtensor<[131072,128],f32> | |
%int15_2409 = torch.constant.int 15 | |
%2988 = torch.prims.convert_element_type %2987, %int15_2409 : !torch.vtensor<[131072,128],f32>, !torch.int -> !torch.vtensor<[131072,128],bf16> | |
%2989 = torch.aten.sin %2986 : !torch.vtensor<[131072,128],f32> -> !torch.vtensor<[131072,128],f32> | |
%int15_2410 = torch.constant.int 15 | |
%2990 = torch.prims.convert_element_type %2989, %int15_2410 : !torch.vtensor<[131072,128],f32>, !torch.int -> !torch.vtensor<[131072,128],bf16> | |
%int0_2411 = torch.constant.int 0 | |
%int0_2412 = torch.constant.int 0 | |
%int1_2413 = torch.constant.int 1 | |
%2991 = torch.aten.slice.Tensor %2988, %int0_2411, %int0_2412, %777, %int1_2413 : !torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %2991, [%773], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int1_2414 = torch.constant.int 1 | |
%int0_2415 = torch.constant.int 0 | |
%int9223372036854775807_2416 = torch.constant.int 9223372036854775807 | |
%int1_2417 = torch.constant.int 1 | |
%2992 = torch.aten.slice.Tensor %2991, %int1_2414, %int0_2415, %int9223372036854775807_2416, %int1_2417 : !torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %2992, [%773], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int0_2418 = torch.constant.int 0 | |
%int0_2419 = torch.constant.int 0 | |
%int1_2420 = torch.constant.int 1 | |
%2993 = torch.aten.slice.Tensor %2990, %int0_2418, %int0_2419, %777, %int1_2420 : !torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %2993, [%773], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int1_2421 = torch.constant.int 1 | |
%int0_2422 = torch.constant.int 0 | |
%int9223372036854775807_2423 = torch.constant.int 9223372036854775807 | |
%int1_2424 = torch.constant.int 1 | |
%2994 = torch.aten.slice.Tensor %2993, %int1_2421, %int0_2422, %int9223372036854775807_2423, %int1_2424 : !torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %2994, [%773], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int0_2425 = torch.constant.int 0 | |
%2995 = torch.aten.unsqueeze %2992, %int0_2425 : !torch.vtensor<[?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %2995, [%773], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int1_2426 = torch.constant.int 1 | |
%int0_2427 = torch.constant.int 0 | |
%int9223372036854775807_2428 = torch.constant.int 9223372036854775807 | |
%int1_2429 = torch.constant.int 1 | |
%2996 = torch.aten.slice.Tensor %2995, %int1_2426, %int0_2427, %int9223372036854775807_2428, %int1_2429 : !torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %2996, [%773], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int2_2430 = torch.constant.int 2 | |
%2997 = torch.aten.unsqueeze %2996, %int2_2430 : !torch.vtensor<[1,?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %2997, [%773], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int3_2431 = torch.constant.int 3 | |
%int0_2432 = torch.constant.int 0 | |
%int9223372036854775807_2433 = torch.constant.int 9223372036854775807 | |
%int1_2434 = torch.constant.int 1 | |
%2998 = torch.aten.slice.Tensor %2997, %int3_2431, %int0_2432, %int9223372036854775807_2433, %int1_2434 : !torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %2998, [%773], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int4_2435 = torch.constant.int 4 | |
%int1_2436 = torch.constant.int 1 | |
%int1_2437 = torch.constant.int 1 | |
%int1_2438 = torch.constant.int 1 | |
%2999 = torch.prim.ListConstruct %int4_2435, %int1_2436, %int1_2437, %int1_2438 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%3000 = torch.aten.repeat %2998, %2999 : !torch.vtensor<[1,?,1,128],bf16>, !torch.list<int> -> !torch.vtensor<[4,?,1,128],bf16> | |
torch.bind_symbolic_shape %3000, [%773], affine_map<()[s0] -> (4, s0 * 32, 1, 128)> : !torch.vtensor<[4,?,1,128],bf16> | |
%int0_2439 = torch.constant.int 0 | |
%3001 = torch.aten.unsqueeze %2994, %int0_2439 : !torch.vtensor<[?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %3001, [%773], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int1_2440 = torch.constant.int 1 | |
%int0_2441 = torch.constant.int 0 | |
%int9223372036854775807_2442 = torch.constant.int 9223372036854775807 | |
%int1_2443 = torch.constant.int 1 | |
%3002 = torch.aten.slice.Tensor %3001, %int1_2440, %int0_2441, %int9223372036854775807_2442, %int1_2443 : !torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %3002, [%773], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int2_2444 = torch.constant.int 2 | |
%3003 = torch.aten.unsqueeze %3002, %int2_2444 : !torch.vtensor<[1,?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %3003, [%773], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int3_2445 = torch.constant.int 3 | |
%int0_2446 = torch.constant.int 0 | |
%int9223372036854775807_2447 = torch.constant.int 9223372036854775807 | |
%int1_2448 = torch.constant.int 1 | |
%3004 = torch.aten.slice.Tensor %3003, %int3_2445, %int0_2446, %int9223372036854775807_2447, %int1_2448 : !torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %3004, [%773], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int4_2449 = torch.constant.int 4 | |
%int1_2450 = torch.constant.int 1 | |
%int1_2451 = torch.constant.int 1 | |
%int1_2452 = torch.constant.int 1 | |
%3005 = torch.prim.ListConstruct %int4_2449, %int1_2450, %int1_2451, %int1_2452 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%3006 = torch.aten.repeat %3004, %3005 : !torch.vtensor<[1,?,1,128],bf16>, !torch.list<int> -> !torch.vtensor<[4,?,1,128],bf16> | |
torch.bind_symbolic_shape %3006, [%773], affine_map<()[s0] -> (4, s0 * 32, 1, 128)> : !torch.vtensor<[4,?,1,128],bf16> | |
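// Apply RoPE to the query heads: q * cos + rotate_half(q) * sin, where rotate_half splits the 128-wide head into two
// 64-wide halves, negates the second half and concatenates it in front of the first.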
%3007 = torch.aten.mul.Tensor %2947, %3000 : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16> -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %3007, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 128)> : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
%int3_2453 = torch.constant.int 3 | |
%int0_2454 = torch.constant.int 0 | |
%int64_2455 = torch.constant.int 64 | |
%int1_2456 = torch.constant.int 1 | |
%3008 = torch.aten.slice.Tensor %2947, %int3_2453, %int0_2454, %int64_2455, %int1_2456 : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %3008, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 64)> : !torch.vtensor<[4,?,32,64],f8E4M3FNUZ> | |
%int3_2457 = torch.constant.int 3 | |
%int64_2458 = torch.constant.int 64 | |
%int9223372036854775807_2459 = torch.constant.int 9223372036854775807 | |
%int1_2460 = torch.constant.int 1 | |
%3009 = torch.aten.slice.Tensor %2947, %int3_2457, %int64_2458, %int9223372036854775807_2459, %int1_2460 : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %3009, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 64)> : !torch.vtensor<[4,?,32,64],f8E4M3FNUZ> | |
%3010 = torch.aten.neg %3009 : !torch.vtensor<[4,?,32,64],f8E4M3FNUZ> -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %3010, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 64)> : !torch.vtensor<[4,?,32,64],f8E4M3FNUZ> | |
%3011 = torch.prim.ListConstruct %3010, %3008 : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>) -> !torch.list<vtensor> | |
%int-1_2461 = torch.constant.int -1 | |
%3012 = torch.aten.cat %3011, %int-1_2461 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %3012, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 128)> : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
%3013 = torch.aten.mul.Tensor %3012, %3006 : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16> -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %3013, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 128)> : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
%int1_2462 = torch.constant.int 1 | |
%3014 = torch.aten.add.Tensor %3007, %3013, %int1_2462 : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %3014, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 128)> : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
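// Recompute the RoPE frequency table for the key path: positions arange(131072), inv_freq = 1/500000^(k/128) for k = 0,2,...,126,
// then a wavelength-dependent rescale (freq/8 beyond wavelength 8192, smooth blend between wavelengths 2048 and 8192 --
// this appears to be the Llama-3 style long-context scaling), followed by the position x frequency outer product
// and cos/sin tables cast to bf16.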
%int131072_2463 = torch.constant.int 131072 | |
%none_2464 = torch.constant.none | |
%none_2465 = torch.constant.none | |
%cpu_2466 = torch.constant.device "cpu" | |
%false_2467 = torch.constant.bool false | |
%3015 = torch.aten.arange %int131072_2463, %none_2464, %none_2465, %cpu_2466, %false_2467 : !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[131072],si64> | |
%int0_2468 = torch.constant.int 0 | |
%int128_2469 = torch.constant.int 128 | |
%int2_2470 = torch.constant.int 2 | |
%int4_2471 = torch.constant.int 4 | |
%none_2472 = torch.constant.none | |
%cpu_2473 = torch.constant.device "cpu" | |
%false_2474 = torch.constant.bool false | |
%3016 = torch.aten.arange.start_step %int0_2468, %int128_2469, %int2_2470, %int4_2471, %none_2472, %cpu_2473, %false_2474 : !torch.int, !torch.int, !torch.int, !torch.int, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[64],si64> | |
%int6_2475 = torch.constant.int 6 | |
%3017 = torch.prims.convert_element_type %3016, %int6_2475 : !torch.vtensor<[64],si64>, !torch.int -> !torch.vtensor<[64],f32> | |
%int128_2476 = torch.constant.int 128 | |
%3018 = torch.aten.div.Scalar %3017, %int128_2476 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%float5.000000e05_2477 = torch.constant.float 5.000000e+05 | |
%3019 = torch.aten.pow.Scalar %float5.000000e05_2477, %3018 : !torch.float, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%3020 = torch.aten.reciprocal %3019 : !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%float1.000000e00_2478 = torch.constant.float 1.000000e+00 | |
%3021 = torch.aten.mul.Scalar %3020, %float1.000000e00_2478 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],f32> | |
%3022 = torch.aten.reciprocal %3021 : !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%float6.283190e00_2479 = torch.constant.float 6.2831853071795862 | |
%3023 = torch.aten.mul.Scalar %3022, %float6.283190e00_2479 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],f32> | |
%float8.192000e03_2480 = torch.constant.float 8.192000e+03 | |
%3024 = torch.aten.gt.Scalar %3023, %float8.192000e03_2480 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],i1> | |
%int8_2481 = torch.constant.int 8 | |
%3025 = torch.aten.div.Scalar %3021, %int8_2481 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%3026 = torch.aten.where.self %3024, %3025, %3021 : !torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%3027 = torch.aten.reciprocal %3023 : !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%int8192_2482 = torch.constant.int 8192 | |
%3028 = torch.aten.mul.Scalar %3027, %int8192_2482 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%int1_2483 = torch.constant.int 1 | |
%int1_2484 = torch.constant.int 1 | |
%3029 = torch.aten.sub.Scalar %3028, %int1_2483, %int1_2484 : !torch.vtensor<[64],f32>, !torch.int, !torch.int -> !torch.vtensor<[64],f32> | |
%int3_2485 = torch.constant.int 3 | |
%3030 = torch.aten.div.Scalar %3029, %int3_2485 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%int1_2486 = torch.constant.int 1 | |
%int1_2487 = torch.constant.int 1 | |
%3031 = torch.aten.rsub.Scalar %3030, %int1_2486, %int1_2487 : !torch.vtensor<[64],f32>, !torch.int, !torch.int -> !torch.vtensor<[64],f32> | |
%3032 = torch.aten.mul.Tensor %3031, %3026 : !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%int8_2488 = torch.constant.int 8 | |
%3033 = torch.aten.div.Scalar %3032, %int8_2488 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%3034 = torch.aten.mul.Tensor %3030, %3026 : !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%int1_2489 = torch.constant.int 1 | |
%3035 = torch.aten.add.Tensor %3033, %3034, %int1_2489 : !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%float2.048000e03_2490 = torch.constant.float 2.048000e+03 | |
%3036 = torch.aten.lt.Scalar %3023, %float2.048000e03_2490 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],i1> | |
%3037 = torch.aten.bitwise_not %3036 : !torch.vtensor<[64],i1> -> !torch.vtensor<[64],i1> | |
%float8.192000e03_2491 = torch.constant.float 8.192000e+03 | |
%3038 = torch.aten.gt.Scalar %3023, %float8.192000e03_2491 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],i1> | |
%3039 = torch.aten.bitwise_not %3038 : !torch.vtensor<[64],i1> -> !torch.vtensor<[64],i1> | |
%3040 = torch.aten.mul.Tensor %3037, %3039 : !torch.vtensor<[64],i1>, !torch.vtensor<[64],i1> -> !torch.vtensor<[64],i1> | |
%3041 = torch.aten.where.self %3040, %3035, %3026 : !torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%3042 = torch.prim.ListConstruct %3041, %3041 : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.list<vtensor> | |
%int-1_2492 = torch.constant.int -1 | |
%3043 = torch.aten.cat %3042, %int-1_2492 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[128],f32> | |
%int6_2493 = torch.constant.int 6 | |
%3044 = torch.prims.convert_element_type %3043, %int6_2493 : !torch.vtensor<[128],f32>, !torch.int -> !torch.vtensor<[128],f32> | |
%int1_2494 = torch.constant.int 1 | |
%3045 = torch.aten.unsqueeze %3015, %int1_2494 : !torch.vtensor<[131072],si64>, !torch.int -> !torch.vtensor<[131072,1],si64> | |
%int6_2495 = torch.constant.int 6 | |
%3046 = torch.prims.convert_element_type %3045, %int6_2495 : !torch.vtensor<[131072,1],si64>, !torch.int -> !torch.vtensor<[131072,1],f32> | |
%int0_2496 = torch.constant.int 0 | |
%3047 = torch.aten.unsqueeze %3044, %int0_2496 : !torch.vtensor<[128],f32>, !torch.int -> !torch.vtensor<[1,128],f32> | |
%int6_2497 = torch.constant.int 6 | |
%3048 = torch.prims.convert_element_type %3047, %int6_2497 : !torch.vtensor<[1,128],f32>, !torch.int -> !torch.vtensor<[1,128],f32> | |
%3049 = torch.aten.mul.Tensor %3046, %3048 : !torch.vtensor<[131072,1],f32>, !torch.vtensor<[1,128],f32> -> !torch.vtensor<[131072,128],f32> | |
%3050 = torch.aten.cos %3049 : !torch.vtensor<[131072,128],f32> -> !torch.vtensor<[131072,128],f32> | |
%int15_2498 = torch.constant.int 15 | |
%3051 = torch.prims.convert_element_type %3050, %int15_2498 : !torch.vtensor<[131072,128],f32>, !torch.int -> !torch.vtensor<[131072,128],bf16> | |
%3052 = torch.aten.sin %3049 : !torch.vtensor<[131072,128],f32> -> !torch.vtensor<[131072,128],f32> | |
%int15_2499 = torch.constant.int 15 | |
%3053 = torch.prims.convert_element_type %3052, %int15_2499 : !torch.vtensor<[131072,128],f32>, !torch.int -> !torch.vtensor<[131072,128],bf16> | |
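// Slice the cos (%3051) and sin (%3053) tables to the first %777 positions and broadcast them to [4, seq, 1, 128] for the key heads.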
%int0_2500 = torch.constant.int 0 | |
%int0_2501 = torch.constant.int 0 | |
%int1_2502 = torch.constant.int 1 | |
%3054 = torch.aten.slice.Tensor %3051, %int0_2500, %int0_2501, %777, %int1_2502 : !torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %3054, [%773], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int1_2503 = torch.constant.int 1 | |
%int0_2504 = torch.constant.int 0 | |
%int9223372036854775807_2505 = torch.constant.int 9223372036854775807 | |
%int1_2506 = torch.constant.int 1 | |
%3055 = torch.aten.slice.Tensor %3054, %int1_2503, %int0_2504, %int9223372036854775807_2505, %int1_2506 : !torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %3055, [%773], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int0_2507 = torch.constant.int 0 | |
%int0_2508 = torch.constant.int 0 | |
%int1_2509 = torch.constant.int 1 | |
%3056 = torch.aten.slice.Tensor %3053, %int0_2507, %int0_2508, %777, %int1_2509 : !torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %3056, [%773], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int1_2510 = torch.constant.int 1 | |
%int0_2511 = torch.constant.int 0 | |
%int9223372036854775807_2512 = torch.constant.int 9223372036854775807 | |
%int1_2513 = torch.constant.int 1 | |
%3057 = torch.aten.slice.Tensor %3056, %int1_2510, %int0_2511, %int9223372036854775807_2512, %int1_2513 : !torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %3057, [%773], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int0_2514 = torch.constant.int 0 | |
%3058 = torch.aten.unsqueeze %3055, %int0_2514 : !torch.vtensor<[?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %3058, [%773], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int1_2515 = torch.constant.int 1 | |
%int0_2516 = torch.constant.int 0 | |
%int9223372036854775807_2517 = torch.constant.int 9223372036854775807 | |
%int1_2518 = torch.constant.int 1 | |
%3059 = torch.aten.slice.Tensor %3058, %int1_2515, %int0_2516, %int9223372036854775807_2517, %int1_2518 : !torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %3059, [%773], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int2_2519 = torch.constant.int 2 | |
%3060 = torch.aten.unsqueeze %3059, %int2_2519 : !torch.vtensor<[1,?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %3060, [%773], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int3_2520 = torch.constant.int 3 | |
%int0_2521 = torch.constant.int 0 | |
%int9223372036854775807_2522 = torch.constant.int 9223372036854775807 | |
%int1_2523 = torch.constant.int 1 | |
%3061 = torch.aten.slice.Tensor %3060, %int3_2520, %int0_2521, %int9223372036854775807_2522, %int1_2523 : !torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %3061, [%773], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int4_2524 = torch.constant.int 4 | |
%int1_2525 = torch.constant.int 1 | |
%int1_2526 = torch.constant.int 1 | |
%int1_2527 = torch.constant.int 1 | |
%3062 = torch.prim.ListConstruct %int4_2524, %int1_2525, %int1_2526, %int1_2527 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%3063 = torch.aten.repeat %3061, %3062 : !torch.vtensor<[1,?,1,128],bf16>, !torch.list<int> -> !torch.vtensor<[4,?,1,128],bf16> | |
torch.bind_symbolic_shape %3063, [%773], affine_map<()[s0] -> (4, s0 * 32, 1, 128)> : !torch.vtensor<[4,?,1,128],bf16> | |
%int0_2528 = torch.constant.int 0 | |
%3064 = torch.aten.unsqueeze %3057, %int0_2528 : !torch.vtensor<[?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %3064, [%773], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int1_2529 = torch.constant.int 1 | |
%int0_2530 = torch.constant.int 0 | |
%int9223372036854775807_2531 = torch.constant.int 9223372036854775807 | |
%int1_2532 = torch.constant.int 1 | |
%3065 = torch.aten.slice.Tensor %3064, %int1_2529, %int0_2530, %int9223372036854775807_2531, %int1_2532 : !torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %3065, [%773], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int2_2533 = torch.constant.int 2 | |
%3066 = torch.aten.unsqueeze %3065, %int2_2533 : !torch.vtensor<[1,?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %3066, [%773], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int3_2534 = torch.constant.int 3 | |
%int0_2535 = torch.constant.int 0 | |
%int9223372036854775807_2536 = torch.constant.int 9223372036854775807 | |
%int1_2537 = torch.constant.int 1 | |
%3067 = torch.aten.slice.Tensor %3066, %int3_2534, %int0_2535, %int9223372036854775807_2536, %int1_2537 : !torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %3067, [%773], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int4_2538 = torch.constant.int 4 | |
%int1_2539 = torch.constant.int 1 | |
%int1_2540 = torch.constant.int 1 | |
%int1_2541 = torch.constant.int 1 | |
%3068 = torch.prim.ListConstruct %int4_2538, %int1_2539, %int1_2540, %int1_2541 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%3069 = torch.aten.repeat %3067, %3068 : !torch.vtensor<[1,?,1,128],bf16>, !torch.list<int> -> !torch.vtensor<[4,?,1,128],bf16> | |
torch.bind_symbolic_shape %3069, [%773], affine_map<()[s0] -> (4, s0 * 32, 1, 128)> : !torch.vtensor<[4,?,1,128],bf16> | |
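// Apply the same rotary embedding to the 8 KV key heads (%2949): k*cos + rotate_half(k)*sin -> %3077.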
%3070 = torch.aten.mul.Tensor %2949, %3063 : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16> -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %3070, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 128)> : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
%int3_2542 = torch.constant.int 3 | |
%int0_2543 = torch.constant.int 0 | |
%int64_2544 = torch.constant.int 64 | |
%int1_2545 = torch.constant.int 1 | |
%3071 = torch.aten.slice.Tensor %2949, %int3_2542, %int0_2543, %int64_2544, %int1_2545 : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %3071, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 64)> : !torch.vtensor<[4,?,8,64],f8E4M3FNUZ> | |
%int3_2546 = torch.constant.int 3 | |
%int64_2547 = torch.constant.int 64 | |
%int9223372036854775807_2548 = torch.constant.int 9223372036854775807 | |
%int1_2549 = torch.constant.int 1 | |
%3072 = torch.aten.slice.Tensor %2949, %int3_2546, %int64_2547, %int9223372036854775807_2548, %int1_2549 : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %3072, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 64)> : !torch.vtensor<[4,?,8,64],f8E4M3FNUZ> | |
%3073 = torch.aten.neg %3072 : !torch.vtensor<[4,?,8,64],f8E4M3FNUZ> -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %3073, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 64)> : !torch.vtensor<[4,?,8,64],f8E4M3FNUZ> | |
%3074 = torch.prim.ListConstruct %3073, %3071 : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>) -> !torch.list<vtensor> | |
%int-1_2550 = torch.constant.int -1 | |
%3075 = torch.aten.cat %3074, %int-1_2550 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %3075, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 128)> : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
%3076 = torch.aten.mul.Tensor %3075, %3069 : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16> -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %3076, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 128)> : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
%int1_2551 = torch.constant.int 1 | |
%3077 = torch.aten.add.Tensor %3070, %3076, %int1_2551 : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %3077, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 128)> : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
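// Compute flat row indices into the paged KV cache: page ids (%arg2) * 64 rows per page plus offset 12, which presumably
// selects this block's key slot among the 32 layers x {K, V} rows of each page; the roped keys are reshaped to
// [batch*pages, 32, 8, 128] rows to match.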
%int64_2552 = torch.constant.int 64 | |
%3078 = torch.aten.mul.Scalar %arg2, %int64_2552 : !torch.vtensor<[4,?],si64>, !torch.int -> !torch.vtensor<[4,?],si64> | |
torch.bind_symbolic_shape %3078, [%773], affine_map<()[s0] -> (4, s0)> : !torch.vtensor<[4,?],si64> | |
%int12 = torch.constant.int 12 | |
%int1_2553 = torch.constant.int 1 | |
%3079 = torch.aten.add.Scalar %3078, %int12, %int1_2553 : !torch.vtensor<[4,?],si64>, !torch.int, !torch.int -> !torch.vtensor<[4,?],si64> | |
torch.bind_symbolic_shape %3079, [%773], affine_map<()[s0] -> (4, s0)> : !torch.vtensor<[4,?],si64> | |
%int4_2554 = torch.constant.int 4 | |
%int32_2555 = torch.constant.int 32 | |
%int8_2556 = torch.constant.int 8 | |
%int128_2557 = torch.constant.int 128 | |
%3080 = torch.prim.ListConstruct %int4_2554, %775, %int32_2555, %int8_2556, %int128_2557 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%3081 = torch.aten.view %3077, %3080 : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %3081, [%773], affine_map<()[s0] -> (4, s0, 32, 8, 128)> : !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ> | |
%int32_2558 = torch.constant.int 32 | |
%int8_2559 = torch.constant.int 8 | |
%int128_2560 = torch.constant.int 128 | |
%3082 = torch.prim.ListConstruct %997, %int32_2558, %int8_2559, %int128_2560 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%3083 = torch.aten.view %3081, %3082 : !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %3083, [%773], affine_map<()[s0] -> (s0 * 4, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%3084 = torch.prim.ListConstruct %997 : (!torch.int) -> !torch.list<int> | |
%3085 = torch.aten.view %3079, %3084 : !torch.vtensor<[4,?],si64>, !torch.list<int> -> !torch.vtensor<[?],si64> | |
torch.bind_symbolic_shape %3085, [%773], affine_map<()[s0] -> (s0 * 4)> : !torch.vtensor<[?],si64> | |
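// Scatter the keys into the paged cache: bitcast the f8E4M3FNUZ values to si8 (the index_put is done on the integer view),
// view the flat cache %2784 ([?, 2097152]) as [pages*64, 32, 8, 128] rows, index_put at the computed row indices,
// bitcast back to f8E4M3FNUZ and restore the flat [?, 2097152] layout.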
%int26_2561 = torch.constant.int 26 | |
%3086 = torch.prims.convert_element_type %3083, %int26_2561 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %3086, [%773], affine_map<()[s0] -> (s0 * 4, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%int1_2562 = torch.constant.int 1 | |
%3087 = torch.aten.view.dtype %3086, %int1_2562 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %3087, [%773], affine_map<()[s0] -> (s0 * 4, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%3088 = torch.aten.detach %3087 : !torch.vtensor<[?,32,8,128],si8> -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %3088, [%773], affine_map<()[s0] -> (s0 * 4, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%3089 = torch.aten.detach %3088 : !torch.vtensor<[?,32,8,128],si8> -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %3089, [%773], affine_map<()[s0] -> (s0 * 4, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%int32_2563 = torch.constant.int 32 | |
%int2_2564 = torch.constant.int 2 | |
%int32_2565 = torch.constant.int 32 | |
%int8_2566 = torch.constant.int 8 | |
%int128_2567 = torch.constant.int 128 | |
%3090 = torch.prim.ListConstruct %776, %int32_2563, %int2_2564, %int32_2565, %int8_2566, %int128_2567 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%3091 = torch.aten.view %2784, %3090 : !torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %3091, [%774], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
%int32_2568 = torch.constant.int 32 | |
%int8_2569 = torch.constant.int 8 | |
%int128_2570 = torch.constant.int 128 | |
%3092 = torch.prim.ListConstruct %990, %int32_2568, %int8_2569, %int128_2570 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%3093 = torch.aten.view %3091, %3092 : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %3093, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%int1_2571 = torch.constant.int 1 | |
%3094 = torch.aten.view.dtype %3093, %int1_2571 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %3094, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%3095 = torch.aten.detach %3094 : !torch.vtensor<[?,32,8,128],si8> -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %3095, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%3096 = torch.aten.detach %3095 : !torch.vtensor<[?,32,8,128],si8> -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %3096, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%3097 = torch.prim.ListConstruct %3085 : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>> | |
%false_2572 = torch.constant.bool false | |
%3098 = torch.aten.index_put %3096, %3097, %3089, %false_2572 : !torch.vtensor<[?,32,8,128],si8>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],si8>, !torch.bool -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %3098, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%int26_2573 = torch.constant.int 26 | |
%3099 = torch.aten.view.dtype %3098, %int26_2573 : !torch.vtensor<[?,32,8,128],si8>, !torch.int -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %3099, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%3100 = torch.aten.detach %3099 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %3100, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%3101 = torch.aten.detach %3100 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %3101, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%int32_2574 = torch.constant.int 32 | |
%int2_2575 = torch.constant.int 2 | |
%int32_2576 = torch.constant.int 32 | |
%int8_2577 = torch.constant.int 8 | |
%int128_2578 = torch.constant.int 128 | |
%3102 = torch.prim.ListConstruct %776, %int32_2574, %int2_2575, %int32_2576, %int8_2577, %int128_2578 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%3103 = torch.aten.view %3101, %3102 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %3103, [%774], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
%int2097152_2579 = torch.constant.int 2097152 | |
%3104 = torch.prim.ListConstruct %776, %int2097152_2579 : (!torch.int, !torch.int) -> !torch.list<int> | |
%3105 = torch.aten.view %3103, %3104 : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,2097152],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %3105, [%774], affine_map<()[s0] -> (s0, 2097152)> : !torch.vtensor<[?,2097152],f8E4M3FNUZ> | |
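// Same scatter for the value states (%2951): row indices are the key indices + 1 (the adjacent value slot),
// written into the cache tensor produced by the key update (%3105).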
%int4_2580 = torch.constant.int 4 | |
%int32_2581 = torch.constant.int 32 | |
%int8_2582 = torch.constant.int 8 | |
%int128_2583 = torch.constant.int 128 | |
%3106 = torch.prim.ListConstruct %int4_2580, %775, %int32_2581, %int8_2582, %int128_2583 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%3107 = torch.aten.view %2951, %3106 : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %3107, [%773], affine_map<()[s0] -> (4, s0, 32, 8, 128)> : !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ> | |
%int32_2584 = torch.constant.int 32 | |
%int8_2585 = torch.constant.int 8 | |
%int128_2586 = torch.constant.int 128 | |
%3108 = torch.prim.ListConstruct %997, %int32_2584, %int8_2585, %int128_2586 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%3109 = torch.aten.view %3107, %3108 : !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %3109, [%773], affine_map<()[s0] -> (s0 * 4, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%int1_2587 = torch.constant.int 1 | |
%int1_2588 = torch.constant.int 1 | |
%3110 = torch.aten.add.Scalar %3079, %int1_2587, %int1_2588 : !torch.vtensor<[4,?],si64>, !torch.int, !torch.int -> !torch.vtensor<[4,?],si64> | |
torch.bind_symbolic_shape %3110, [%773], affine_map<()[s0] -> (4, s0)> : !torch.vtensor<[4,?],si64> | |
%3111 = torch.prim.ListConstruct %997 : (!torch.int) -> !torch.list<int> | |
%3112 = torch.aten.view %3110, %3111 : !torch.vtensor<[4,?],si64>, !torch.list<int> -> !torch.vtensor<[?],si64> | |
torch.bind_symbolic_shape %3112, [%773], affine_map<()[s0] -> (s0 * 4)> : !torch.vtensor<[?],si64> | |
%int26_2589 = torch.constant.int 26 | |
%3113 = torch.prims.convert_element_type %3109, %int26_2589 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %3113, [%773], affine_map<()[s0] -> (s0 * 4, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%int1_2590 = torch.constant.int 1 | |
%3114 = torch.aten.view.dtype %3113, %int1_2590 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %3114, [%773], affine_map<()[s0] -> (s0 * 4, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%3115 = torch.aten.detach %3114 : !torch.vtensor<[?,32,8,128],si8> -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %3115, [%773], affine_map<()[s0] -> (s0 * 4, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%3116 = torch.aten.detach %3115 : !torch.vtensor<[?,32,8,128],si8> -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %3116, [%773], affine_map<()[s0] -> (s0 * 4, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%int32_2591 = torch.constant.int 32 | |
%int2_2592 = torch.constant.int 2 | |
%int32_2593 = torch.constant.int 32 | |
%int8_2594 = torch.constant.int 8 | |
%int128_2595 = torch.constant.int 128 | |
%3117 = torch.prim.ListConstruct %776, %int32_2591, %int2_2592, %int32_2593, %int8_2594, %int128_2595 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%3118 = torch.aten.view %3105, %3117 : !torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %3118, [%774], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
%int32_2596 = torch.constant.int 32 | |
%int8_2597 = torch.constant.int 8 | |
%int128_2598 = torch.constant.int 128 | |
%3119 = torch.prim.ListConstruct %990, %int32_2596, %int8_2597, %int128_2598 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%3120 = torch.aten.view %3118, %3119 : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %3120, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%int1_2599 = torch.constant.int 1 | |
%3121 = torch.aten.view.dtype %3120, %int1_2599 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %3121, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%3122 = torch.aten.detach %3121 : !torch.vtensor<[?,32,8,128],si8> -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %3122, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%3123 = torch.aten.detach %3122 : !torch.vtensor<[?,32,8,128],si8> -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %3123, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%3124 = torch.prim.ListConstruct %3112 : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>> | |
%false_2600 = torch.constant.bool false | |
%3125 = torch.aten.index_put %3123, %3124, %3116, %false_2600 : !torch.vtensor<[?,32,8,128],si8>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],si8>, !torch.bool -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %3125, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%int26_2601 = torch.constant.int 26 | |
%3126 = torch.aten.view.dtype %3125, %int26_2601 : !torch.vtensor<[?,32,8,128],si8>, !torch.int -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %3126, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%3127 = torch.aten.detach %3126 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %3127, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%3128 = torch.aten.detach %3127 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %3128, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%int32_2602 = torch.constant.int 32 | |
%int2_2603 = torch.constant.int 2 | |
%int32_2604 = torch.constant.int 32 | |
%int8_2605 = torch.constant.int 8 | |
%int128_2606 = torch.constant.int 128 | |
%3129 = torch.prim.ListConstruct %776, %int32_2602, %int2_2603, %int32_2604, %int8_2605, %int128_2606 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%3130 = torch.aten.view %3128, %3129 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %3130, [%774], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
%int2097152_2607 = torch.constant.int 2097152 | |
%3131 = torch.prim.ListConstruct %776, %int2097152_2607 : (!torch.int, !torch.int) -> !torch.list<int> | |
%3132 = torch.aten.view %3130, %3131 : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,2097152],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %3132, [%774], affine_map<()[s0] -> (s0, 2097152)> : !torch.vtensor<[?,2097152],f8E4M3FNUZ> | |
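// Grouped-query attention expansion: unsqueeze the 8 KV heads of the roped keys (%3077) and of the values (%2951)
// and expand each 4x so they match the 32 query heads, yielding [4, seq, 32, 128].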
%int-2_2608 = torch.constant.int -2 | |
%3133 = torch.aten.unsqueeze %3077, %int-2_2608 : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %3133, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 1, 128)> : !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ> | |
%int4_2609 = torch.constant.int 4 | |
%int8_2610 = torch.constant.int 8 | |
%int4_2611 = torch.constant.int 4 | |
%int128_2612 = torch.constant.int 128 | |
%3134 = torch.prim.ListConstruct %int4_2609, %777, %int8_2610, %int4_2611, %int128_2612 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_2613 = torch.constant.bool false | |
%3135 = torch.aten.expand %3133, %3134, %false_2613 : !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %3135, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 4, 128)> : !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ> | |
%int0_2614 = torch.constant.int 0 | |
%3136 = torch.aten.clone %3135, %int0_2614 : !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %3136, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 4, 128)> : !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ> | |
%int4_2615 = torch.constant.int 4 | |
%int32_2616 = torch.constant.int 32 | |
%int128_2617 = torch.constant.int 128 | |
%3137 = torch.prim.ListConstruct %int4_2615, %777, %int32_2616, %int128_2617 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%3138 = torch.aten._unsafe_view %3136, %3137 : !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %3138, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 128)> : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
%int-2_2618 = torch.constant.int -2 | |
%3139 = torch.aten.unsqueeze %2951, %int-2_2618 : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %3139, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 1, 128)> : !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ> | |
%int4_2619 = torch.constant.int 4 | |
%int8_2620 = torch.constant.int 8 | |
%int4_2621 = torch.constant.int 4 | |
%int128_2622 = torch.constant.int 128 | |
%3140 = torch.prim.ListConstruct %int4_2619, %777, %int8_2620, %int4_2621, %int128_2622 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_2623 = torch.constant.bool false | |
%3141 = torch.aten.expand %3139, %3140, %false_2623 : !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %3141, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 4, 128)> : !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ> | |
%int0_2624 = torch.constant.int 0 | |
%3142 = torch.aten.clone %3141, %int0_2624 : !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %3142, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 4, 128)> : !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ> | |
%int4_2625 = torch.constant.int 4 | |
%int32_2626 = torch.constant.int 32 | |
%int128_2627 = torch.constant.int 128 | |
%3143 = torch.prim.ListConstruct %int4_2625, %777, %int32_2626, %int128_2627 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%3144 = torch.aten._unsafe_view %3142, %3143 : !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %3144, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 128)> : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
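// Transpose Q (%3014), K (%3138) and V (%3144) to [4, 32, seq, 128] and re-assert the f8E4M3FNUZ element type ahead of the attention kernel.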
%int1_2628 = torch.constant.int 1 | |
%int2_2629 = torch.constant.int 2 | |
%3145 = torch.aten.transpose.int %3014, %int1_2628, %int2_2629 : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %3145, [%773], affine_map<()[s0] -> (4, 32, s0 * 32, 128)> : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
%int1_2630 = torch.constant.int 1 | |
%int2_2631 = torch.constant.int 2 | |
%3146 = torch.aten.transpose.int %3138, %int1_2630, %int2_2631 : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %3146, [%773], affine_map<()[s0] -> (4, 32, s0 * 32, 128)> : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
%int1_2632 = torch.constant.int 1 | |
%int2_2633 = torch.constant.int 2 | |
%3147 = torch.aten.transpose.int %3144, %int1_2632, %int2_2633 : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %3147, [%773], affine_map<()[s0] -> (4, 32, s0 * 32, 128)> : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
%int26_2634 = torch.constant.int 26 | |
%3148 = torch.prims.convert_element_type %3145, %int26_2634 : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %3148, [%773], affine_map<()[s0] -> (4, 32, s0 * 32, 128)> : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
%int26_2635 = torch.constant.int 26 | |
%3149 = torch.prims.convert_element_type %3146, %int26_2635 : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %3149, [%773], affine_map<()[s0] -> (4, 32, s0 * 32, 128)> : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
%int26_2636 = torch.constant.int 26 | |
%3150 = torch.prims.convert_element_type %3147, %int26_2636 : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %3150, [%773], affine_map<()[s0] -> (4, 32, s0 * 32, 128)> : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
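// Cast the additive attention mask %803 ([4, 1, seq, seq], f32) to f8E4M3FNUZ and select/slice it down to a single 2-D [seq, seq] mask.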
%int26_2637 = torch.constant.int 26 | |
%3151 = torch.prims.convert_element_type %803, %int26_2637 : !torch.vtensor<[4,1,?,?],f32>, !torch.int -> !torch.vtensor<[4,1,?,?],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %3151, [%773], affine_map<()[s0] -> (4, 1, s0 * 32, s0 * 32)> : !torch.vtensor<[4,1,?,?],f8E4M3FNUZ> | |
%int0_2638 = torch.constant.int 0 | |
%int0_2639 = torch.constant.int 0 | |
%3152 = torch.aten.select.int %3151, %int0_2638, %int0_2639 : !torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[1,?,?],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %3152, [%773], affine_map<()[s0] -> (1, s0 * 32, s0 * 32)> : !torch.vtensor<[1,?,?],f8E4M3FNUZ> | |
%int0_2640 = torch.constant.int 0 | |
%int0_2641 = torch.constant.int 0 | |
%3153 = torch.aten.select.int %3152, %int0_2640, %int0_2641 : !torch.vtensor<[1,?,?],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[?,?],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %3153, [%773], affine_map<()[s0] -> (s0 * 32, s0 * 32)> : !torch.vtensor<[?,?],f8E4M3FNUZ> | |
%int0_2642 = torch.constant.int 0 | |
%int0_2643 = torch.constant.int 0 | |
%int9223372036854775807_2644 = torch.constant.int 9223372036854775807 | |
%int1_2645 = torch.constant.int 1 | |
%3154 = torch.aten.slice.Tensor %3153, %int0_2642, %int0_2643, %int9223372036854775807_2644, %int1_2645 : !torch.vtensor<[?,?],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,?],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %3154, [%773], affine_map<()[s0] -> (s0 * 32, s0 * 32)> : !torch.vtensor<[?,?],f8E4M3FNUZ> | |
%int1_2646 = torch.constant.int 1 | |
%int0_2647 = torch.constant.int 0 | |
%int9223372036854775807_2648 = torch.constant.int 9223372036854775807 | |
%int1_2649 = torch.constant.int 1 | |
%3155 = torch.aten.slice.Tensor %3154, %int1_2646, %int0_2647, %int9223372036854775807_2648, %int1_2649 : !torch.vtensor<[?,?],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,?],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %3155, [%773], affine_map<()[s0] -> (s0 * 32, s0 * 32)> : !torch.vtensor<[?,?],f8E4M3FNUZ> | |
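// Masked flash attention: %3159 is the scalar attention scale taken from global %155; the sharktank masked-flash-attention
// microkernel consumes Q, K, V, the scale and the 2-D mask and returns the f32 context tensor of shape [4, 32, seq, 128].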
%none_2650 = torch.constant.none | |
%3156 = torch.aten.clone %155, %none_2650 : !torch.vtensor<[],f32>, !torch.none -> !torch.vtensor<[],f32> | |
%3157 = torch.aten.detach %3156 : !torch.vtensor<[],f32> -> !torch.vtensor<[],f32> | |
%3158 = torch.aten.detach %3157 : !torch.vtensor<[],f32> -> !torch.vtensor<[],f32> | |
%3159 = torch.aten.detach %3158 : !torch.vtensor<[],f32> -> !torch.vtensor<[],f32> | |
%3160 = torch_c.to_builtin_tensor %3148 : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> -> tensor<4x32x?x128xf8E4M3FNUZ> | |
%3161 = torch_c.to_builtin_tensor %3149 : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> -> tensor<4x32x?x128xf8E4M3FNUZ> | |
%3162 = torch_c.to_builtin_tensor %3150 : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> -> tensor<4x32x?x128xf8E4M3FNUZ> | |
%3163 = torch_c.to_builtin_tensor %3155 : !torch.vtensor<[?,?],f8E4M3FNUZ> -> tensor<?x?xf8E4M3FNUZ> | |
%3164 = torch_c.to_builtin_tensor %3159 : !torch.vtensor<[],f32> -> tensor<f32> | |
%3165 = util.call @sharktank_masked_flash_attention_4_32_128_128_f8E4M3FNUZ_f8E4M3FNUZ_f32_f32(%3160, %3161, %3162, %3164, %3163) : (tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<f32>, tensor<?x?xf8E4M3FNUZ>) -> tensor<4x32x?x128xf32> | |
%3166 = torch_c.from_builtin_tensor %3165 : tensor<4x32x?x128xf32> -> !torch.vtensor<[4,32,?,128],f32> | |
torch.bind_symbolic_shape %3166, [%773], affine_map<()[s0] -> (4, 32, s0 * 32, 128)> : !torch.vtensor<[4,32,?,128],f32> | |
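// Attention output projection: transpose back to [4, seq, 32, 128], flatten the heads to 4096, quantize to f8E4M3FNUZ
// (divide by scale %156, clamp to the f8 range +/-240), run the batched transpose-B matmul against the expanded
// 4096x4096 attn_output weight, then dequantize with the product of the scale constants %156 and %158 and add onto the residual %2896.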
%int1_2651 = torch.constant.int 1 | |
%int2_2652 = torch.constant.int 2 | |
%3167 = torch.aten.transpose.int %3166, %int1_2651, %int2_2652 : !torch.vtensor<[4,32,?,128],f32>, !torch.int, !torch.int -> !torch.vtensor<[4,?,32,128],f32> | |
torch.bind_symbolic_shape %3167, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 128)> : !torch.vtensor<[4,?,32,128],f32> | |
%int0_2653 = torch.constant.int 0 | |
%3168 = torch.aten.clone %3167, %int0_2653 : !torch.vtensor<[4,?,32,128],f32>, !torch.int -> !torch.vtensor<[4,?,32,128],f32> | |
torch.bind_symbolic_shape %3168, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 128)> : !torch.vtensor<[4,?,32,128],f32> | |
%int4_2654 = torch.constant.int 4 | |
%int4096_2655 = torch.constant.int 4096 | |
%3169 = torch.prim.ListConstruct %int4_2654, %777, %int4096_2655 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%3170 = torch.aten._unsafe_view %3168, %3169 : !torch.vtensor<[4,?,32,128],f32>, !torch.list<int> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %3170, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%3171 = torch.aten.div.Tensor %3170, %156 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %3171, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%float-2.400000e02_2656 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_2657 = torch.constant.float 2.400000e+02 | |
%3172 = torch.aten.clamp %3171, %float-2.400000e02_2656, %float2.400000e02_2657 : !torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %3172, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int26_2658 = torch.constant.int 26 | |
%3173 = torch.prims.convert_element_type %3172, %int26_2658 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %3173, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
%int0_2659 = torch.constant.int 0 | |
%3174 = torch.aten.unsqueeze %157, %int0_2659 : !torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ> | |
%int4_2660 = torch.constant.int 4 | |
%int4096_2661 = torch.constant.int 4096 | |
%int4096_2662 = torch.constant.int 4096 | |
%3175 = torch.prim.ListConstruct %int4_2660, %int4096_2661, %int4096_2662 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_2663 = torch.constant.bool false | |
%3176 = torch.aten.expand %3174, %3175, %false_2663 : !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ> | |
%3177 = torch_c.to_builtin_tensor %3173 : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> -> tensor<4x?x4096xf8E4M3FNUZ> | |
%3178 = torch_c.to_builtin_tensor %3176 : !torch.vtensor<[4,4096,4096],f8E4M3FNUZ> -> tensor<4x4096x4096xf8E4M3FNUZ> | |
%3179 = util.call @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ_f32(%3177, %3178) : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x?x4096xf32> | |
%3180 = torch_c.from_builtin_tensor %3179 : tensor<4x?x4096xf32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %3180, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%3181 = torch.prim.ListConstruct : () -> !torch.list<int> | |
%3182 = torch.aten.permute %158, %3181 : !torch.vtensor<[],f32>, !torch.list<int> -> !torch.vtensor<[],f32> | |
%3183 = torch.aten.mul.Tensor %156, %3182 : !torch.vtensor<[],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[],f32> | |
%int6_2664 = torch.constant.int 6 | |
%3184 = torch.prims.convert_element_type %3180, %int6_2664 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %3184, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%3185 = torch.aten.mul.Tensor %3184, %3183 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %3185, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int1_2665 = torch.constant.int 1 | |
%3186 = torch.aten.add.Tensor %2896, %3185, %int1_2665 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %3186, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
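// FFN RMSNorm on the post-attention residual %3186: x * rsqrt(mean(x^2) + 1e-05), scaled by the ffn_norm weight %159.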
%int6_2666 = torch.constant.int 6 | |
%3187 = torch.prims.convert_element_type %3186, %int6_2666 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %3187, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int2_2667 = torch.constant.int 2 | |
%3188 = torch.aten.pow.Tensor_Scalar %3187, %int2_2667 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %3188, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int-1_2668 = torch.constant.int -1 | |
%3189 = torch.prim.ListConstruct %int-1_2668 : (!torch.int) -> !torch.list<int> | |
%true_2669 = torch.constant.bool true | |
%none_2670 = torch.constant.none | |
%3190 = torch.aten.mean.dim %3188, %3189, %true_2669, %none_2670 : !torch.vtensor<[4,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[4,?,1],f32> | |
torch.bind_symbolic_shape %3190, [%773], affine_map<()[s0] -> (4, s0 * 32, 1)> : !torch.vtensor<[4,?,1],f32> | |
%float1.000000e-05_2671 = torch.constant.float 1.000000e-05 | |
%int1_2672 = torch.constant.int 1 | |
%3191 = torch.aten.add.Scalar %3190, %float1.000000e-05_2671, %int1_2672 : !torch.vtensor<[4,?,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[4,?,1],f32> | |
torch.bind_symbolic_shape %3191, [%773], affine_map<()[s0] -> (4, s0 * 32, 1)> : !torch.vtensor<[4,?,1],f32> | |
%3192 = torch.aten.rsqrt %3191 : !torch.vtensor<[4,?,1],f32> -> !torch.vtensor<[4,?,1],f32> | |
torch.bind_symbolic_shape %3192, [%773], affine_map<()[s0] -> (4, s0 * 32, 1)> : !torch.vtensor<[4,?,1],f32> | |
%3193 = torch.aten.mul.Tensor %3187, %3192 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[4,?,1],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %3193, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int6_2673 = torch.constant.int 6 | |
%3194 = torch.prims.convert_element_type %3193, %int6_2673 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %3194, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%3195 = torch.aten.mul.Tensor %159, %3194 : !torch.vtensor<[4096],bf16>, !torch.vtensor<[4,?,4096],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %3195, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int6_2674 = torch.constant.int 6 | |
%3196 = torch.prims.convert_element_type %3195, %int6_2674 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %3196, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
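// ffn_gate branch: quantize the normed activations (divide by %160, clamp to +/-240, cast to f8E4M3FNUZ),
// batched transpose-B matmul against the expanded 14336x4096 gate weight, dequantize with the %160 * %162 scale product, then SiLU.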
%3197 = torch.aten.div.Tensor %3196, %160 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %3197, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%float-2.400000e02_2675 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_2676 = torch.constant.float 2.400000e+02 | |
%3198 = torch.aten.clamp %3197, %float-2.400000e02_2675, %float2.400000e02_2676 : !torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %3198, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int26_2677 = torch.constant.int 26 | |
%3199 = torch.prims.convert_element_type %3198, %int26_2677 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %3199, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
%int0_2678 = torch.constant.int 0 | |
%3200 = torch.aten.unsqueeze %161, %int0_2678 : !torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ> | |
%int4_2679 = torch.constant.int 4 | |
%int14336_2680 = torch.constant.int 14336 | |
%int4096_2681 = torch.constant.int 4096 | |
%3201 = torch.prim.ListConstruct %int4_2679, %int14336_2680, %int4096_2681 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_2682 = torch.constant.bool false | |
%3202 = torch.aten.expand %3200, %3201, %false_2682 : !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ> | |
%3203 = torch_c.to_builtin_tensor %3199 : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> -> tensor<4x?x4096xf8E4M3FNUZ> | |
%3204 = torch_c.to_builtin_tensor %3202 : !torch.vtensor<[4,14336,4096],f8E4M3FNUZ> -> tensor<4x14336x4096xf8E4M3FNUZ> | |
%3205 = util.call @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ_f32(%3203, %3204) : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x?x14336xf32> | |
%3206 = torch_c.from_builtin_tensor %3205 : tensor<4x?x14336xf32> -> !torch.vtensor<[4,?,14336],f32> | |
torch.bind_symbolic_shape %3206, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f32> | |
%3207 = torch.prim.ListConstruct : () -> !torch.list<int> | |
%3208 = torch.aten.permute %162, %3207 : !torch.vtensor<[],f32>, !torch.list<int> -> !torch.vtensor<[],f32> | |
%3209 = torch.aten.mul.Tensor %160, %3208 : !torch.vtensor<[],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[],f32> | |
%int6_2683 = torch.constant.int 6 | |
%3210 = torch.prims.convert_element_type %3206, %int6_2683 : !torch.vtensor<[4,?,14336],f32>, !torch.int -> !torch.vtensor<[4,?,14336],f32> | |
torch.bind_symbolic_shape %3210, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f32> | |
%3211 = torch.aten.mul.Tensor %3210, %3209 : !torch.vtensor<[4,?,14336],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,14336],f32> | |
torch.bind_symbolic_shape %3211, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f32> | |
%3212 = torch.aten.silu %3211 : !torch.vtensor<[4,?,14336],f32> -> !torch.vtensor<[4,?,14336],f32> | |
torch.bind_symbolic_shape %3212, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f32> | |
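// ffn_up branch: the same quantize / matmul / dequantize pipeline against the up-projection weight %164,
// multiplied elementwise with the SiLU(gate) activations (%3212).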
%3213 = torch.aten.div.Tensor %3196, %163 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %3213, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%float-2.400000e02_2684 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_2685 = torch.constant.float 2.400000e+02 | |
%3214 = torch.aten.clamp %3213, %float-2.400000e02_2684, %float2.400000e02_2685 : !torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %3214, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int26_2686 = torch.constant.int 26 | |
%3215 = torch.prims.convert_element_type %3214, %int26_2686 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %3215, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
%int0_2687 = torch.constant.int 0 | |
%3216 = torch.aten.unsqueeze %164, %int0_2687 : !torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ> | |
%int4_2688 = torch.constant.int 4 | |
%int14336_2689 = torch.constant.int 14336 | |
%int4096_2690 = torch.constant.int 4096 | |
%3217 = torch.prim.ListConstruct %int4_2688, %int14336_2689, %int4096_2690 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_2691 = torch.constant.bool false | |
%3218 = torch.aten.expand %3216, %3217, %false_2691 : !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ> | |
%3219 = torch_c.to_builtin_tensor %3215 : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> -> tensor<4x?x4096xf8E4M3FNUZ> | |
%3220 = torch_c.to_builtin_tensor %3218 : !torch.vtensor<[4,14336,4096],f8E4M3FNUZ> -> tensor<4x14336x4096xf8E4M3FNUZ> | |
%3221 = util.call @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ_f32(%3219, %3220) : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x?x14336xf32> | |
%3222 = torch_c.from_builtin_tensor %3221 : tensor<4x?x14336xf32> -> !torch.vtensor<[4,?,14336],f32> | |
torch.bind_symbolic_shape %3222, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f32> | |
%3223 = torch.prim.ListConstruct : () -> !torch.list<int> | |
%3224 = torch.aten.permute %165, %3223 : !torch.vtensor<[],f32>, !torch.list<int> -> !torch.vtensor<[],f32> | |
%3225 = torch.aten.mul.Tensor %163, %3224 : !torch.vtensor<[],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[],f32> | |
%int6_2692 = torch.constant.int 6 | |
%3226 = torch.prims.convert_element_type %3222, %int6_2692 : !torch.vtensor<[4,?,14336],f32>, !torch.int -> !torch.vtensor<[4,?,14336],f32> | |
torch.bind_symbolic_shape %3226, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f32> | |
%3227 = torch.aten.mul.Tensor %3226, %3225 : !torch.vtensor<[4,?,14336],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,14336],f32> | |
torch.bind_symbolic_shape %3227, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f32> | |
%3228 = torch.aten.mul.Tensor %3212, %3227 : !torch.vtensor<[4,?,14336],f32>, !torch.vtensor<[4,?,14336],f32> -> !torch.vtensor<[4,?,14336],f32> | |
torch.bind_symbolic_shape %3228, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f32> | |
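// ffn_down projection: quantize the gated product (divide by %166, clamp to +/-240, cast to f8E4M3FNUZ),
// batched matmul against the 4096x14336 down weight %167, dequantize, and add the result back onto the residual stream (%3186).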
%3229 = torch.aten.div.Tensor %3228, %166 : !torch.vtensor<[4,?,14336],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,14336],f32> | |
torch.bind_symbolic_shape %3229, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f32> | |
%float-2.400000e02_2693 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_2694 = torch.constant.float 2.400000e+02 | |
%3230 = torch.aten.clamp %3229, %float-2.400000e02_2693, %float2.400000e02_2694 : !torch.vtensor<[4,?,14336],f32>, !torch.float, !torch.float -> !torch.vtensor<[4,?,14336],f32> | |
torch.bind_symbolic_shape %3230, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f32> | |
%int26_2695 = torch.constant.int 26 | |
%3231 = torch.prims.convert_element_type %3230, %int26_2695 : !torch.vtensor<[4,?,14336],f32>, !torch.int -> !torch.vtensor<[4,?,14336],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %3231, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f8E4M3FNUZ> | |
%int0_2696 = torch.constant.int 0 | |
%3232 = torch.aten.unsqueeze %167, %int0_2696 : !torch.vtensor<[4096,14336],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,4096,14336],f8E4M3FNUZ> | |
%int4_2697 = torch.constant.int 4 | |
%int4096_2698 = torch.constant.int 4096 | |
%int14336_2699 = torch.constant.int 14336 | |
%3233 = torch.prim.ListConstruct %int4_2697, %int4096_2698, %int14336_2699 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_2700 = torch.constant.bool false | |
%3234 = torch.aten.expand %3232, %3233, %false_2700 : !torch.vtensor<[1,4096,14336],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[4,4096,14336],f8E4M3FNUZ> | |
%3235 = torch_c.to_builtin_tensor %3231 : !torch.vtensor<[4,?,14336],f8E4M3FNUZ> -> tensor<4x?x14336xf8E4M3FNUZ> | |
%3236 = torch_c.to_builtin_tensor %3234 : !torch.vtensor<[4,4096,14336],f8E4M3FNUZ> -> tensor<4x4096x14336xf8E4M3FNUZ> | |
%3237 = util.call @sharktank_batch_matmul_transpose_b_L4xDx14336xf8E4M3FNUZ_R4x4096x14336xf8E4M3FNUZ_f32(%3235, %3236) : (tensor<4x?x14336xf8E4M3FNUZ>, tensor<4x4096x14336xf8E4M3FNUZ>) -> tensor<4x?x4096xf32> | |
%3238 = torch_c.from_builtin_tensor %3237 : tensor<4x?x4096xf32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %3238, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%3239 = torch.prim.ListConstruct : () -> !torch.list<int> | |
%3240 = torch.aten.permute %168, %3239 : !torch.vtensor<[],f32>, !torch.list<int> -> !torch.vtensor<[],f32> | |
%3241 = torch.aten.mul.Tensor %166, %3240 : !torch.vtensor<[],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[],f32> | |
%int6_2701 = torch.constant.int 6 | |
%3242 = torch.prims.convert_element_type %3238, %int6_2701 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %3242, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%3243 = torch.aten.mul.Tensor %3242, %3241 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %3243, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
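// Dequantization of the down-projection output: the kernel result is multiplied by the combined
// scale (input rscale %166 times the weight dequant scale). The add below folds the FFN output
// back into the residual stream %3186.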
%int1_2702 = torch.constant.int 1 | |
%3244 = torch.aten.add.Tensor %3186, %3243, %int1_2702 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %3244, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
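// RMSNorm over the residual for the next block: square the activations, mean over the last
// dimension, add eps (1e-05), rsqrt, multiply back, then scale by the bf16 norm weight %169.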
%int6_2703 = torch.constant.int 6 | |
%3245 = torch.prims.convert_element_type %3244, %int6_2703 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %3245, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int2_2704 = torch.constant.int 2 | |
%3246 = torch.aten.pow.Tensor_Scalar %3245, %int2_2704 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %3246, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int-1_2705 = torch.constant.int -1 | |
%3247 = torch.prim.ListConstruct %int-1_2705 : (!torch.int) -> !torch.list<int> | |
%true_2706 = torch.constant.bool true | |
%none_2707 = torch.constant.none | |
%3248 = torch.aten.mean.dim %3246, %3247, %true_2706, %none_2707 : !torch.vtensor<[4,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[4,?,1],f32> | |
torch.bind_symbolic_shape %3248, [%773], affine_map<()[s0] -> (4, s0 * 32, 1)> : !torch.vtensor<[4,?,1],f32> | |
%float1.000000e-05_2708 = torch.constant.float 1.000000e-05 | |
%int1_2709 = torch.constant.int 1 | |
%3249 = torch.aten.add.Scalar %3248, %float1.000000e-05_2708, %int1_2709 : !torch.vtensor<[4,?,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[4,?,1],f32> | |
torch.bind_symbolic_shape %3249, [%773], affine_map<()[s0] -> (4, s0 * 32, 1)> : !torch.vtensor<[4,?,1],f32> | |
%3250 = torch.aten.rsqrt %3249 : !torch.vtensor<[4,?,1],f32> -> !torch.vtensor<[4,?,1],f32> | |
torch.bind_symbolic_shape %3250, [%773], affine_map<()[s0] -> (4, s0 * 32, 1)> : !torch.vtensor<[4,?,1],f32> | |
%3251 = torch.aten.mul.Tensor %3245, %3250 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[4,?,1],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %3251, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int6_2710 = torch.constant.int 6 | |
%3252 = torch.prims.convert_element_type %3251, %int6_2710 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %3252, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%3253 = torch.aten.mul.Tensor %169, %3252 : !torch.vtensor<[4096],bf16>, !torch.vtensor<[4,?,4096],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %3253, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int6_2711 = torch.constant.int 6 | |
%3254 = torch.prims.convert_element_type %3253, %int6_2711 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %3254, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
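// Likely the attention Q projection of the next block: quantize the normalized activations to fp8
// against the q_input rscale %170 (divide, clamp to +/-240, cast), then batched transpose-B matmul
// with the [4096,4096] fp8 weight %171.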
%3255 = torch.aten.div.Tensor %3254, %170 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %3255, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%float-2.400000e02_2712 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_2713 = torch.constant.float 2.400000e+02 | |
%3256 = torch.aten.clamp %3255, %float-2.400000e02_2712, %float2.400000e02_2713 : !torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %3256, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int26_2714 = torch.constant.int 26 | |
%3257 = torch.prims.convert_element_type %3256, %int26_2714 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %3257, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
%int0_2715 = torch.constant.int 0 | |
%3258 = torch.aten.unsqueeze %171, %int0_2715 : !torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ> | |
%int4_2716 = torch.constant.int 4 | |
%int4096_2717 = torch.constant.int 4096 | |
%int4096_2718 = torch.constant.int 4096 | |
%3259 = torch.prim.ListConstruct %int4_2716, %int4096_2717, %int4096_2718 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_2719 = torch.constant.bool false | |
%3260 = torch.aten.expand %3258, %3259, %false_2719 : !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ> | |
%3261 = torch_c.to_builtin_tensor %3257 : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> -> tensor<4x?x4096xf8E4M3FNUZ> | |
%3262 = torch_c.to_builtin_tensor %3260 : !torch.vtensor<[4,4096,4096],f8E4M3FNUZ> -> tensor<4x4096x4096xf8E4M3FNUZ> | |
%3263 = util.call @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ_f32(%3261, %3262) : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x?x4096xf32> | |
%3264 = torch_c.from_builtin_tensor %3263 : tensor<4x?x4096xf32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %3264, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%3265 = torch.aten.div.Tensor %3264, %172 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %3265, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%float-2.400000e02_2720 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_2721 = torch.constant.float 2.400000e+02 | |
%3266 = torch.aten.clamp %3265, %float-2.400000e02_2720, %float2.400000e02_2721 : !torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %3266, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int26_2722 = torch.constant.int 26 | |
%3267 = torch.prims.convert_element_type %3266, %int26_2722 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %3267, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
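// The Q result above is rescaled by what appears to be the q_output rscale %172 and re-quantized to
// fp8. Next, the same normalized activations %3254 are quantized against the k_input rscale %173
// for the K projection with the [1024,4096] weight %174.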
%3268 = torch.aten.div.Tensor %3254, %173 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %3268, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%float-2.400000e02_2723 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_2724 = torch.constant.float 2.400000e+02 | |
%3269 = torch.aten.clamp %3268, %float-2.400000e02_2723, %float2.400000e02_2724 : !torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %3269, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int26_2725 = torch.constant.int 26 | |
%3270 = torch.prims.convert_element_type %3269, %int26_2725 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %3270, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
%int0_2726 = torch.constant.int 0 | |
%3271 = torch.aten.unsqueeze %174, %int0_2726 : !torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ> | |
%int4_2727 = torch.constant.int 4 | |
%int1024_2728 = torch.constant.int 1024 | |
%int4096_2729 = torch.constant.int 4096 | |
%3272 = torch.prim.ListConstruct %int4_2727, %int1024_2728, %int4096_2729 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_2730 = torch.constant.bool false | |
%3273 = torch.aten.expand %3271, %3272, %false_2730 : !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ> | |
%3274 = torch_c.to_builtin_tensor %3270 : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> -> tensor<4x?x4096xf8E4M3FNUZ> | |
%3275 = torch_c.to_builtin_tensor %3273 : !torch.vtensor<[4,1024,4096],f8E4M3FNUZ> -> tensor<4x1024x4096xf8E4M3FNUZ> | |
%3276 = util.call @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ_f32(%3274, %3275) : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x?x1024xf32> | |
%3277 = torch_c.from_builtin_tensor %3276 : tensor<4x?x1024xf32> -> !torch.vtensor<[4,?,1024],f32> | |
torch.bind_symbolic_shape %3277, [%773], affine_map<()[s0] -> (4, s0 * 32, 1024)> : !torch.vtensor<[4,?,1024],f32> | |
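// K projection output [4,?,1024]; below it is rescaled (presumably by the k_output rscale %175),
// clamped to +/-240, and cast to fp8, matching the quantized attention path used elsewhere in
// this module.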
%3278 = torch.aten.div.Tensor %3277, %175 : !torch.vtensor<[4,?,1024],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,1024],f32> | |
torch.bind_symbolic_shape %3278, [%773], affine_map<()[s0] -> (4, s0 * 32, 1024)> : !torch.vtensor<[4,?,1024],f32> | |
%float-2.400000e02_2731 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_2732 = torch.constant.float 2.400000e+02 | |
%3279 = torch.aten.clamp %3278, %float-2.400000e02_2731, %float2.400000e02_2732 : !torch.vtensor<[4,?,1024],f32>, !torch.float, !torch.float -> !torch.vtensor<[4,?,1024],f32> | |
torch.bind_symbolic_shape %3279, [%773], affine_map<()[s0] -> (4, s0 * 32, 1024)> : !torch.vtensor<[4,?,1024],f32>