// NOTE: This file has been truncated; the full IR is available in the original gist.
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
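// #map drops d2 from the 4-D iteration space (a broadcast pattern), while #map1 is the identity map over
// all four dimensions. Their uses are in the truncated body below; as a guess, they would typically appear
// together as the indexing_maps of linalg.generic ops that broadcast a rank-3 operand against a rank-4 one.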
module @module {
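// The globals below follow a per-block naming convention for the quantized checkpoint: each transformer
// block blk.N carries bf16 norm weights (attn_norm, ffn_norm), f8E4M3FNUZ quantized weight tensors
// suffixed ":qs", and per-tensor f32 rescale factors suffixed ":rscale" for the quantized inputs and the
// KV-cache quantizer. All of them are resolved at load time from the "model" scope of an external
// parameter archive via #stream.parameter.named rather than being inlined as constants.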
util.global private @__auto.token_embd.weight = #stream.parameter.named<"model"::"token_embd.weight"> : tensor<128256x4096xbf16>
util.global private @__auto.blk.0.attn_norm.weight = #stream.parameter.named<"model"::"blk.0.attn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.0.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.0.attn_q.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.0.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.0.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.0.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.0.attn_k.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.0.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.0.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.0.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.0.attn_v.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.0.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.0.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.0.kv_cache.quantizer:rscale" = #stream.parameter.named<"model"::"blk.0.kv_cache.quantizer:rscale"> : tensor<f32>
util.global private @"__auto.blk.0.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.0.attn_output.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.0.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.0.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @__auto.blk.0.ffn_norm.weight = #stream.parameter.named<"model"::"blk.0.ffn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.0.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.0.ffn_gate.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.0.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.0.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.0.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.0.ffn_up.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.0.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.0.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.0.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.0.ffn_down.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.0.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.0.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>
util.global private @__auto.blk.1.attn_norm.weight = #stream.parameter.named<"model"::"blk.1.attn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.1.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.1.attn_q.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.1.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.1.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.1.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.1.attn_k.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.1.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.1.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.1.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.1.attn_v.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.1.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.1.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.1.kv_cache.quantizer:rscale" = #stream.parameter.named<"model"::"blk.1.kv_cache.quantizer:rscale"> : tensor<f32>
util.global private @"__auto.blk.1.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.1.attn_output.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.1.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.1.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @__auto.blk.1.ffn_norm.weight = #stream.parameter.named<"model"::"blk.1.ffn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.1.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.1.ffn_gate.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.1.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.1.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.1.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.1.ffn_up.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.1.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.1.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.1.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.1.ffn_down.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.1.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.1.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>
util.global private @__auto.blk.2.attn_norm.weight = #stream.parameter.named<"model"::"blk.2.attn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.2.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.2.attn_q.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.2.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.2.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.2.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.2.attn_k.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.2.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.2.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.2.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.2.attn_v.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.2.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.2.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.2.kv_cache.quantizer:rscale" = #stream.parameter.named<"model"::"blk.2.kv_cache.quantizer:rscale"> : tensor<f32>
util.global private @"__auto.blk.2.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.2.attn_output.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.2.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.2.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @__auto.blk.2.ffn_norm.weight = #stream.parameter.named<"model"::"blk.2.ffn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.2.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.2.ffn_gate.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.2.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.2.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.2.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.2.ffn_up.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.2.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.2.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.2.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.2.ffn_down.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.2.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.2.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>
util.global private @__auto.blk.3.attn_norm.weight = #stream.parameter.named<"model"::"blk.3.attn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.3.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.3.attn_q.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.3.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.3.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.3.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.3.attn_k.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.3.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.3.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.3.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.3.attn_v.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.3.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.3.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.3.kv_cache.quantizer:rscale" = #stream.parameter.named<"model"::"blk.3.kv_cache.quantizer:rscale"> : tensor<f32>
util.global private @"__auto.blk.3.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.3.attn_output.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.3.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.3.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @__auto.blk.3.ffn_norm.weight = #stream.parameter.named<"model"::"blk.3.ffn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.3.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.3.ffn_gate.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.3.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.3.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.3.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.3.ffn_up.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.3.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.3.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.3.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.3.ffn_down.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.3.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.3.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>
util.global private @__auto.blk.4.attn_norm.weight = #stream.parameter.named<"model"::"blk.4.attn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.4.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.4.attn_q.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.4.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.4.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.4.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.4.attn_k.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.4.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.4.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.4.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.4.attn_v.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.4.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.4.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.4.kv_cache.quantizer:rscale" = #stream.parameter.named<"model"::"blk.4.kv_cache.quantizer:rscale"> : tensor<f32>
util.global private @"__auto.blk.4.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.4.attn_output.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.4.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.4.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @__auto.blk.4.ffn_norm.weight = #stream.parameter.named<"model"::"blk.4.ffn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.4.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.4.ffn_gate.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.4.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.4.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.4.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.4.ffn_up.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.4.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.4.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.4.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.4.ffn_down.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.4.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.4.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>
util.global private @__auto.blk.5.attn_norm.weight = #stream.parameter.named<"model"::"blk.5.attn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.5.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.5.attn_q.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.5.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.5.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.5.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.5.attn_k.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.5.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.5.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.5.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.5.attn_v.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.5.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.5.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.5.kv_cache.quantizer:rscale" = #stream.parameter.named<"model"::"blk.5.kv_cache.quantizer:rscale"> : tensor<f32>
util.global private @"__auto.blk.5.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.5.attn_output.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.5.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.5.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @__auto.blk.5.ffn_norm.weight = #stream.parameter.named<"model"::"blk.5.ffn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.5.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.5.ffn_gate.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.5.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.5.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.5.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.5.ffn_up.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.5.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.5.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.5.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.5.ffn_down.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.5.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.5.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>
util.global private @__auto.blk.6.attn_norm.weight = #stream.parameter.named<"model"::"blk.6.attn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.6.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.6.attn_q.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.6.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.6.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.6.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.6.attn_k.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.6.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.6.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.6.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.6.attn_v.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.6.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.6.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.6.kv_cache.quantizer:rscale" = #stream.parameter.named<"model"::"blk.6.kv_cache.quantizer:rscale"> : tensor<f32>
util.global private @"__auto.blk.6.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.6.attn_output.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.6.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.6.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @__auto.blk.6.ffn_norm.weight = #stream.parameter.named<"model"::"blk.6.ffn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.6.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.6.ffn_gate.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.6.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.6.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.6.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.6.ffn_up.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.6.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.6.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.6.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.6.ffn_down.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.6.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.6.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>
util.global private @__auto.blk.7.attn_norm.weight = #stream.parameter.named<"model"::"blk.7.attn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.7.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.7.attn_q.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.7.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.7.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.7.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.7.attn_k.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.7.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.7.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.7.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.7.attn_v.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.7.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.7.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.7.kv_cache.quantizer:rscale" = #stream.parameter.named<"model"::"blk.7.kv_cache.quantizer:rscale"> : tensor<f32>
util.global private @"__auto.blk.7.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.7.attn_output.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.7.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.7.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @__auto.blk.7.ffn_norm.weight = #stream.parameter.named<"model"::"blk.7.ffn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.7.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.7.ffn_gate.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.7.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.7.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.7.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.7.ffn_up.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.7.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.7.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.7.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.7.ffn_down.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.7.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.7.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>
util.global private @__auto.blk.8.attn_norm.weight = #stream.parameter.named<"model"::"blk.8.attn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.8.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.8.attn_q.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.8.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.8.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.8.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.8.attn_k.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.8.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.8.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.8.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.8.attn_v.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.8.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.8.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.8.kv_cache.quantizer:rscale" = #stream.parameter.named<"model"::"blk.8.kv_cache.quantizer:rscale"> : tensor<f32>
util.global private @"__auto.blk.8.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.8.attn_output.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.8.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.8.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @__auto.blk.8.ffn_norm.weight = #stream.parameter.named<"model"::"blk.8.ffn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.8.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.8.ffn_gate.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.8.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.8.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.8.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.8.ffn_up.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.8.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.8.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.8.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.8.ffn_down.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.8.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.8.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>
util.global private @__auto.blk.9.attn_norm.weight = #stream.parameter.named<"model"::"blk.9.attn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.9.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.9.attn_q.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.9.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.9.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.9.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.9.attn_k.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.9.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.9.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.9.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.9.attn_v.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.9.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.9.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.9.kv_cache.quantizer:rscale" = #stream.parameter.named<"model"::"blk.9.kv_cache.quantizer:rscale"> : tensor<f32>
util.global private @"__auto.blk.9.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.9.attn_output.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.9.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.9.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @__auto.blk.9.ffn_norm.weight = #stream.parameter.named<"model"::"blk.9.ffn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.9.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.9.ffn_gate.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.9.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.9.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.9.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.9.ffn_up.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.9.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.9.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.9.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.9.ffn_down.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.9.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.9.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>
util.global private @__auto.blk.10.attn_norm.weight = #stream.parameter.named<"model"::"blk.10.attn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.10.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.10.attn_q.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.10.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.10.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.10.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.10.attn_k.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.10.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.10.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.10.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.10.attn_v.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.10.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.10.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.10.kv_cache.quantizer:rscale" = #stream.parameter.named<"model"::"blk.10.kv_cache.quantizer:rscale"> : tensor<f32>
util.global private @"__auto.blk.10.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.10.attn_output.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.10.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.10.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @__auto.blk.10.ffn_norm.weight = #stream.parameter.named<"model"::"blk.10.ffn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.10.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.10.ffn_gate.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.10.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.10.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.10.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.10.ffn_up.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.10.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.10.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.10.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.10.ffn_down.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.10.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.10.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>
util.global private @__auto.blk.11.attn_norm.weight = #stream.parameter.named<"model"::"blk.11.attn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.11.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.11.attn_q.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.11.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.11.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.11.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.11.attn_k.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.11.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.11.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.11.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.11.attn_v.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.11.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.11.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.11.kv_cache.quantizer:rscale" = #stream.parameter.named<"model"::"blk.11.kv_cache.quantizer:rscale"> : tensor<f32>
util.global private @"__auto.blk.11.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.11.attn_output.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.11.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.11.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @__auto.blk.11.ffn_norm.weight = #stream.parameter.named<"model"::"blk.11.ffn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.11.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.11.ffn_gate.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.11.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.11.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.11.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.11.ffn_up.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.11.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.11.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.11.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.11.ffn_down.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.11.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.11.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>
util.global private @__auto.blk.12.attn_norm.weight = #stream.parameter.named<"model"::"blk.12.attn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.12.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.12.attn_q.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.12.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.12.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.12.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.12.attn_k.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.12.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.12.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.12.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.12.attn_v.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.12.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.12.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.12.kv_cache.quantizer:rscale" = #stream.parameter.named<"model"::"blk.12.kv_cache.quantizer:rscale"> : tensor<f32>
util.global private @"__auto.blk.12.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.12.attn_output.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.12.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.12.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @__auto.blk.12.ffn_norm.weight = #stream.parameter.named<"model"::"blk.12.ffn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.12.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.12.ffn_gate.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.12.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.12.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.12.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.12.ffn_up.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.12.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.12.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.12.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.12.ffn_down.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.12.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.12.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>
util.global private @__auto.blk.13.attn_norm.weight = #stream.parameter.named<"model"::"blk.13.attn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.13.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.13.attn_q.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.13.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.13.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.13.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.13.attn_k.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.13.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.13.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.13.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.13.attn_v.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.13.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.13.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.13.kv_cache.quantizer:rscale" = #stream.parameter.named<"model"::"blk.13.kv_cache.quantizer:rscale"> : tensor<f32>
util.global private @"__auto.blk.13.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.13.attn_output.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.13.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.13.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @__auto.blk.13.ffn_norm.weight = #stream.parameter.named<"model"::"blk.13.ffn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.13.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.13.ffn_gate.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.13.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.13.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.13.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.13.ffn_up.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.13.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.13.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.13.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.13.ffn_down.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.13.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.13.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>
util.global private @__auto.blk.14.attn_norm.weight = #stream.parameter.named<"model"::"blk.14.attn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.14.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.14.attn_q.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.14.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.14.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.14.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.14.attn_k.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.14.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.14.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.14.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.14.attn_v.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.14.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.14.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.14.kv_cache.quantizer:rscale" = #stream.parameter.named<"model"::"blk.14.kv_cache.quantizer:rscale"> : tensor<f32>
util.global private @"__auto.blk.14.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.14.attn_output.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.14.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.14.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @__auto.blk.14.ffn_norm.weight = #stream.parameter.named<"model"::"blk.14.ffn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.14.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.14.ffn_gate.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.14.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.14.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.14.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.14.ffn_up.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.14.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.14.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.14.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.14.ffn_down.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.14.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.14.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>
util.global private @__auto.blk.15.attn_norm.weight = #stream.parameter.named<"model"::"blk.15.attn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.15.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.15.attn_q.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.15.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.15.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.15.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.15.attn_k.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.15.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.15.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.15.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.15.attn_v.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.15.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.15.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.15.kv_cache.quantizer:rscale" = #stream.parameter.named<"model"::"blk.15.kv_cache.quantizer:rscale"> : tensor<f32>
util.global private @"__auto.blk.15.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.15.attn_output.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.15.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.15.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @__auto.blk.15.ffn_norm.weight = #stream.parameter.named<"model"::"blk.15.ffn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.15.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.15.ffn_gate.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.15.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.15.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.15.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.15.ffn_up.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.15.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.15.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.15.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.15.ffn_down.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.15.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.15.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>
util.global private @__auto.blk.16.attn_norm.weight = #stream.parameter.named<"model"::"blk.16.attn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.16.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.16.attn_q.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.16.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.16.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.16.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.16.attn_k.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.16.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.16.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.16.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.16.attn_v.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.16.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.16.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.16.kv_cache.quantizer:rscale" = #stream.parameter.named<"model"::"blk.16.kv_cache.quantizer:rscale"> : tensor<f32>
util.global private @"__auto.blk.16.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.16.attn_output.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.16.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.16.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @__auto.blk.16.ffn_norm.weight = #stream.parameter.named<"model"::"blk.16.ffn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.16.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.16.ffn_gate.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.16.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.16.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.16.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.16.ffn_up.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.16.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.16.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.16.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.16.ffn_down.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.16.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.16.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>
util.global private @__auto.blk.17.attn_norm.weight = #stream.parameter.named<"model"::"blk.17.attn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.17.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.17.attn_q.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.17.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.17.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.17.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.17.attn_k.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.17.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.17.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.17.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.17.attn_v.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.17.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.17.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.17.kv_cache.quantizer:rscale" = #stream.parameter.named<"model"::"blk.17.kv_cache.quantizer:rscale"> : tensor<f32>
util.global private @"__auto.blk.17.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.17.attn_output.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.17.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.17.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @__auto.blk.17.ffn_norm.weight = #stream.parameter.named<"model"::"blk.17.ffn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.17.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.17.ffn_gate.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.17.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.17.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.17.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.17.ffn_up.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.17.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.17.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.17.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.17.ffn_down.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.17.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.17.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>
util.global private @__auto.blk.18.attn_norm.weight = #stream.parameter.named<"model"::"blk.18.attn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.18.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.18.attn_q.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.18.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.18.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.18.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.18.attn_k.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.18.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.18.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.18.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.18.attn_v.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.18.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.18.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.18.kv_cache.quantizer:rscale" = #stream.parameter.named<"model"::"blk.18.kv_cache.quantizer:rscale"> : tensor<f32>
util.global private @"__auto.blk.18.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.18.attn_output.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.18.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.18.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @__auto.blk.18.ffn_norm.weight = #stream.parameter.named<"model"::"blk.18.ffn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.18.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.18.ffn_gate.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.18.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.18.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.18.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.18.ffn_up.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.18.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.18.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.18.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.18.ffn_down.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.18.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.18.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>
util.global private @__auto.blk.19.attn_norm.weight = #stream.parameter.named<"model"::"blk.19.attn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.19.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.19.attn_q.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.19.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.19.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.19.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.19.attn_k.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.19.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.19.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.19.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.19.attn_v.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.19.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.19.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.19.kv_cache.quantizer:rscale" = #stream.parameter.named<"model"::"blk.19.kv_cache.quantizer:rscale"> : tensor<f32>
util.global private @"__auto.blk.19.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.19.attn_output.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.19.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.19.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @__auto.blk.19.ffn_norm.weight = #stream.parameter.named<"model"::"blk.19.ffn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.19.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.19.ffn_gate.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.19.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.19.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.19.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.19.ffn_up.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.19.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.19.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.19.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.19.ffn_down.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.19.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.19.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>
util.global private @__auto.blk.20.attn_norm.weight = #stream.parameter.named<"model"::"blk.20.attn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.20.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.20.attn_q.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.20.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.20.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.20.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.20.attn_k.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.20.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.20.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.20.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.20.attn_v.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.20.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.20.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.20.kv_cache.quantizer:rscale" = #stream.parameter.named<"model"::"blk.20.kv_cache.quantizer:rscale"> : tensor<f32>
util.global private @"__auto.blk.20.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.20.attn_output.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.20.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.20.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @__auto.blk.20.ffn_norm.weight = #stream.parameter.named<"model"::"blk.20.ffn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.20.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.20.ffn_gate.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.20.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.20.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.20.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.20.ffn_up.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.20.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.20.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.20.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.20.ffn_down.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.20.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.20.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>
util.global private @__auto.blk.21.attn_norm.weight = #stream.parameter.named<"model"::"blk.21.attn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.21.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.21.attn_q.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.21.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.21.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.21.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.21.attn_k.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.21.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.21.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.21.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.21.attn_v.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.21.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.21.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.21.kv_cache.quantizer:rscale" = #stream.parameter.named<"model"::"blk.21.kv_cache.quantizer:rscale"> : tensor<f32>
util.global private @"__auto.blk.21.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.21.attn_output.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.21.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.21.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @__auto.blk.21.ffn_norm.weight = #stream.parameter.named<"model"::"blk.21.ffn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.21.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.21.ffn_gate.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.21.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.21.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.21.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.21.ffn_up.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.21.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.21.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.21.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.21.ffn_down.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.21.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.21.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>
util.global private @__auto.blk.22.attn_norm.weight = #stream.parameter.named<"model"::"blk.22.attn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.22.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.22.attn_q.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.22.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.22.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.22.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.22.attn_k.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.22.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.22.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.22.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.22.attn_v.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.22.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.22.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.22.kv_cache.quantizer:rscale" = #stream.parameter.named<"model"::"blk.22.kv_cache.quantizer:rscale"> : tensor<f32>
util.global private @"__auto.blk.22.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.22.attn_output.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.22.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.22.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @__auto.blk.22.ffn_norm.weight = #stream.parameter.named<"model"::"blk.22.ffn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.22.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.22.ffn_gate.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.22.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.22.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.22.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.22.ffn_up.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.22.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.22.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.22.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.22.ffn_down.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.22.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.22.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>
util.global private @__auto.blk.23.attn_norm.weight = #stream.parameter.named<"model"::"blk.23.attn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.23.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.23.attn_q.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.23.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.23.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.23.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.23.attn_k.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.23.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.23.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.23.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.23.attn_v.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.23.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.23.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.23.kv_cache.quantizer:rscale" = #stream.parameter.named<"model"::"blk.23.kv_cache.quantizer:rscale"> : tensor<f32>
util.global private @"__auto.blk.23.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.23.attn_output.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.23.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.23.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @__auto.blk.23.ffn_norm.weight = #stream.parameter.named<"model"::"blk.23.ffn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.23.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.23.ffn_gate.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.23.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.23.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.23.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.23.ffn_up.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.23.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.23.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.23.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.23.ffn_down.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.23.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.23.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>
util.global private @__auto.blk.24.attn_norm.weight = #stream.parameter.named<"model"::"blk.24.attn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.24.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.24.attn_q.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.24.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.24.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.24.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.24.attn_k.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.24.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.24.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.24.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.24.attn_v.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.24.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.24.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.24.kv_cache.quantizer:rscale" = #stream.parameter.named<"model"::"blk.24.kv_cache.quantizer:rscale"> : tensor<f32>
util.global private @"__auto.blk.24.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.24.attn_output.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.24.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.24.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @__auto.blk.24.ffn_norm.weight = #stream.parameter.named<"model"::"blk.24.ffn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.24.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.24.ffn_gate.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.24.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.24.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.24.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.24.ffn_up.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.24.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.24.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.24.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.24.ffn_down.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.24.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.24.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>
util.global private @__auto.blk.25.attn_norm.weight = #stream.parameter.named<"model"::"blk.25.attn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.25.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.25.attn_q.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.25.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.25.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.25.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.25.attn_k.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.25.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.25.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.25.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.25.attn_v.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.25.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.25.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.25.kv_cache.quantizer:rscale" = #stream.parameter.named<"model"::"blk.25.kv_cache.quantizer:rscale"> : tensor<f32>
util.global private @"__auto.blk.25.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.25.attn_output.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.25.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.25.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @__auto.blk.25.ffn_norm.weight = #stream.parameter.named<"model"::"blk.25.ffn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.25.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.25.ffn_gate.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.25.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.25.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.25.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.25.ffn_up.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.25.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.25.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.25.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.25.ffn_down.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.25.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.25.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>
util.global private @__auto.blk.26.attn_norm.weight = #stream.parameter.named<"model"::"blk.26.attn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.26.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.26.attn_q.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.26.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.26.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.26.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.26.attn_k.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.26.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.26.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.26.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.26.attn_v.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.26.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.26.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.26.kv_cache.quantizer:rscale" = #stream.parameter.named<"model"::"blk.26.kv_cache.quantizer:rscale"> : tensor<f32>
util.global private @"__auto.blk.26.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.26.attn_output.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.26.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.26.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @__auto.blk.26.ffn_norm.weight = #stream.parameter.named<"model"::"blk.26.ffn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.26.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.26.ffn_gate.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.26.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.26.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.26.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.26.ffn_up.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.26.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.26.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.26.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.26.ffn_down.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.26.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.26.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>
util.global private @__auto.blk.27.attn_norm.weight = #stream.parameter.named<"model"::"blk.27.attn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.27.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.27.attn_q.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.27.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.27.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.27.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.27.attn_k.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.27.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.27.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.27.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.27.attn_v.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.27.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.27.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.27.kv_cache.quantizer:rscale" = #stream.parameter.named<"model"::"blk.27.kv_cache.quantizer:rscale"> : tensor<f32>
util.global private @"__auto.blk.27.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.27.attn_output.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.27.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.27.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @__auto.blk.27.ffn_norm.weight = #stream.parameter.named<"model"::"blk.27.ffn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.27.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.27.ffn_gate.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.27.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.27.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.27.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.27.ffn_up.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.27.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.27.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.27.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.27.ffn_down.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.27.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.27.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>
util.global private @__auto.blk.28.attn_norm.weight = #stream.parameter.named<"model"::"blk.28.attn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.28.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.28.attn_q.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.28.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.28.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.28.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.28.attn_k.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.28.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.28.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.28.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.28.attn_v.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.28.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.28.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.28.kv_cache.quantizer:rscale" = #stream.parameter.named<"model"::"blk.28.kv_cache.quantizer:rscale"> : tensor<f32>
util.global private @"__auto.blk.28.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.28.attn_output.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.28.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.28.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @__auto.blk.28.ffn_norm.weight = #stream.parameter.named<"model"::"blk.28.ffn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.28.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.28.ffn_gate.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.28.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.28.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.28.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.28.ffn_up.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.28.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.28.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.28.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.28.ffn_down.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.28.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.28.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>
util.global private @__auto.blk.29.attn_norm.weight = #stream.parameter.named<"model"::"blk.29.attn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.29.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.29.attn_q.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.29.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.29.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.29.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.29.attn_k.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.29.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.29.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.29.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.29.attn_v.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.29.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.29.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.29.kv_cache.quantizer:rscale" = #stream.parameter.named<"model"::"blk.29.kv_cache.quantizer:rscale"> : tensor<f32>
util.global private @"__auto.blk.29.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.29.attn_output.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.29.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.29.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @__auto.blk.29.ffn_norm.weight = #stream.parameter.named<"model"::"blk.29.ffn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.29.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.29.ffn_gate.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.29.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.29.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.29.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.29.ffn_up.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.29.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.29.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.29.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.29.ffn_down.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.29.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.29.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>
util.global private @__auto.blk.30.attn_norm.weight = #stream.parameter.named<"model"::"blk.30.attn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.30.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.30.attn_q.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.30.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.30.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.30.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.30.attn_k.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.30.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.30.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.30.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.30.attn_v.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.30.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.30.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.30.kv_cache.quantizer:rscale" = #stream.parameter.named<"model"::"blk.30.kv_cache.quantizer:rscale"> : tensor<f32>
util.global private @"__auto.blk.30.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.30.attn_output.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.30.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.30.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @__auto.blk.30.ffn_norm.weight = #stream.parameter.named<"model"::"blk.30.ffn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.30.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.30.ffn_gate.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.30.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.30.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.30.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.30.ffn_up.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.30.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.30.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.30.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.30.ffn_down.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.30.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.30.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>
util.global private @__auto.blk.31.attn_norm.weight = #stream.parameter.named<"model"::"blk.31.attn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.31.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.31.attn_q.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.31.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.31.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.31.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.31.attn_k.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.31.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.31.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.31.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.31.attn_v.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.31.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.31.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.31.kv_cache.quantizer:rscale" = #stream.parameter.named<"model"::"blk.31.kv_cache.quantizer:rscale"> : tensor<f32>
util.global private @"__auto.blk.31.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.31.attn_output.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.31.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.31.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @__auto.blk.31.ffn_norm.weight = #stream.parameter.named<"model"::"blk.31.ffn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.31.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.31.ffn_gate.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.31.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.31.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.31.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.31.ffn_up.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.31.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.31.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.31.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.31.ffn_down.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.31.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.31.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>
util.global private @__auto.output_norm.weight = #stream.parameter.named<"model"::"output_norm.weight"> : tensor<4096xbf16>
util.global private @__auto.output.weight = #stream.parameter.named<"model"::"output.weight"> : tensor<128256x4096xbf16>
func.func @prefill_bs1(%arg0: !torch.vtensor<[1,?],si64>, %arg1: !torch.vtensor<[1],si64>, %arg2: !torch.vtensor<[1,?],si64>, %arg3: !torch.tensor<[?,2097152],f16>) -> !torch.vtensor<[1,?,128256],f32> attributes {torch.assume_strict_symbolic_shapes} {
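    // Preamble (likely auto-generated): load every per-block parameter global declared above — attention/FFN quantized weights (":qs"), input rescale factors (":rscale"), KV-cache quantizer scales, and norm weights — and convert each builtin tensor to a torch vtensor via torch_c.from_builtin_tensor for use in the prefill computation below.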
%__auto.token_embd.weight = util.global.load @__auto.token_embd.weight : tensor<128256x4096xbf16>
%0 = torch_c.from_builtin_tensor %__auto.token_embd.weight : tensor<128256x4096xbf16> -> !torch.vtensor<[128256,4096],bf16>
%__auto.blk.0.attn_norm.weight = util.global.load @__auto.blk.0.attn_norm.weight : tensor<4096xbf16>
%1 = torch_c.from_builtin_tensor %__auto.blk.0.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.blk.0.attn_q.q_input3Arscale = util.global.load @"__auto.blk.0.attn_q.q_input:rscale" : tensor<f32>
%2 = torch_c.from_builtin_tensor %__auto.blk.0.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.0.attn_q.weight3Aqs = util.global.load @"__auto.blk.0.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%3 = torch_c.from_builtin_tensor %__auto.blk.0.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.0.attn_k.q_input3Arscale = util.global.load @"__auto.blk.0.attn_k.q_input:rscale" : tensor<f32>
%4 = torch_c.from_builtin_tensor %__auto.blk.0.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.0.attn_k.weight3Aqs = util.global.load @"__auto.blk.0.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%5 = torch_c.from_builtin_tensor %__auto.blk.0.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.0.attn_v.q_input3Arscale = util.global.load @"__auto.blk.0.attn_v.q_input:rscale" : tensor<f32>
%6 = torch_c.from_builtin_tensor %__auto.blk.0.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.0.attn_v.weight3Aqs = util.global.load @"__auto.blk.0.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%7 = torch_c.from_builtin_tensor %__auto.blk.0.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.0.kv_cache.quantizer3Arscale = util.global.load @"__auto.blk.0.kv_cache.quantizer:rscale" : tensor<f32>
%8 = torch_c.from_builtin_tensor %__auto.blk.0.kv_cache.quantizer3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.0.attn_output.q_input3Arscale = util.global.load @"__auto.blk.0.attn_output.q_input:rscale" : tensor<f32>
%9 = torch_c.from_builtin_tensor %__auto.blk.0.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.0.attn_output.weight3Aqs = util.global.load @"__auto.blk.0.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%10 = torch_c.from_builtin_tensor %__auto.blk.0.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.0.ffn_norm.weight = util.global.load @__auto.blk.0.ffn_norm.weight : tensor<4096xbf16>
%11 = torch_c.from_builtin_tensor %__auto.blk.0.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.blk.0.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.0.ffn_gate.q_input:rscale" : tensor<f32>
%12 = torch_c.from_builtin_tensor %__auto.blk.0.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.0.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.0.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%13 = torch_c.from_builtin_tensor %__auto.blk.0.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.0.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.0.ffn_up.q_input:rscale" : tensor<f32>
%14 = torch_c.from_builtin_tensor %__auto.blk.0.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.0.ffn_up.weight3Aqs = util.global.load @"__auto.blk.0.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%15 = torch_c.from_builtin_tensor %__auto.blk.0.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.0.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.0.ffn_down.q_input:rscale" : tensor<f32>
%16 = torch_c.from_builtin_tensor %__auto.blk.0.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.0.ffn_down.weight3Aqs = util.global.load @"__auto.blk.0.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ>
%17 = torch_c.from_builtin_tensor %__auto.blk.0.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
%__auto.blk.1.attn_norm.weight = util.global.load @__auto.blk.1.attn_norm.weight : tensor<4096xbf16>
%18 = torch_c.from_builtin_tensor %__auto.blk.1.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.blk.1.attn_q.q_input3Arscale = util.global.load @"__auto.blk.1.attn_q.q_input:rscale" : tensor<f32>
%19 = torch_c.from_builtin_tensor %__auto.blk.1.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.1.attn_q.weight3Aqs = util.global.load @"__auto.blk.1.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%20 = torch_c.from_builtin_tensor %__auto.blk.1.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.1.attn_k.q_input3Arscale = util.global.load @"__auto.blk.1.attn_k.q_input:rscale" : tensor<f32>
%21 = torch_c.from_builtin_tensor %__auto.blk.1.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.1.attn_k.weight3Aqs = util.global.load @"__auto.blk.1.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%22 = torch_c.from_builtin_tensor %__auto.blk.1.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.1.attn_v.q_input3Arscale = util.global.load @"__auto.blk.1.attn_v.q_input:rscale" : tensor<f32>
%23 = torch_c.from_builtin_tensor %__auto.blk.1.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.1.attn_v.weight3Aqs = util.global.load @"__auto.blk.1.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%24 = torch_c.from_builtin_tensor %__auto.blk.1.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.1.kv_cache.quantizer3Arscale = util.global.load @"__auto.blk.1.kv_cache.quantizer:rscale" : tensor<f32>
%25 = torch_c.from_builtin_tensor %__auto.blk.1.kv_cache.quantizer3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.1.attn_output.q_input3Arscale = util.global.load @"__auto.blk.1.attn_output.q_input:rscale" : tensor<f32>
%26 = torch_c.from_builtin_tensor %__auto.blk.1.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.1.attn_output.weight3Aqs = util.global.load @"__auto.blk.1.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%27 = torch_c.from_builtin_tensor %__auto.blk.1.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.1.ffn_norm.weight = util.global.load @__auto.blk.1.ffn_norm.weight : tensor<4096xbf16>
%28 = torch_c.from_builtin_tensor %__auto.blk.1.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.blk.1.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.1.ffn_gate.q_input:rscale" : tensor<f32>
%29 = torch_c.from_builtin_tensor %__auto.blk.1.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.1.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.1.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%30 = torch_c.from_builtin_tensor %__auto.blk.1.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.1.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.1.ffn_up.q_input:rscale" : tensor<f32>
%31 = torch_c.from_builtin_tensor %__auto.blk.1.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.1.ffn_up.weight3Aqs = util.global.load @"__auto.blk.1.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%32 = torch_c.from_builtin_tensor %__auto.blk.1.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.1.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.1.ffn_down.q_input:rscale" : tensor<f32>
%33 = torch_c.from_builtin_tensor %__auto.blk.1.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.1.ffn_down.weight3Aqs = util.global.load @"__auto.blk.1.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ>
%34 = torch_c.from_builtin_tensor %__auto.blk.1.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
%__auto.blk.2.attn_norm.weight = util.global.load @__auto.blk.2.attn_norm.weight : tensor<4096xbf16>
%35 = torch_c.from_builtin_tensor %__auto.blk.2.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.blk.2.attn_q.q_input3Arscale = util.global.load @"__auto.blk.2.attn_q.q_input:rscale" : tensor<f32>
%36 = torch_c.from_builtin_tensor %__auto.blk.2.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.2.attn_q.weight3Aqs = util.global.load @"__auto.blk.2.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%37 = torch_c.from_builtin_tensor %__auto.blk.2.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.2.attn_k.q_input3Arscale = util.global.load @"__auto.blk.2.attn_k.q_input:rscale" : tensor<f32>
%38 = torch_c.from_builtin_tensor %__auto.blk.2.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.2.attn_k.weight3Aqs = util.global.load @"__auto.blk.2.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%39 = torch_c.from_builtin_tensor %__auto.blk.2.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.2.attn_v.q_input3Arscale = util.global.load @"__auto.blk.2.attn_v.q_input:rscale" : tensor<f32>
%40 = torch_c.from_builtin_tensor %__auto.blk.2.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.2.attn_v.weight3Aqs = util.global.load @"__auto.blk.2.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%41 = torch_c.from_builtin_tensor %__auto.blk.2.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.2.kv_cache.quantizer3Arscale = util.global.load @"__auto.blk.2.kv_cache.quantizer:rscale" : tensor<f32>
%42 = torch_c.from_builtin_tensor %__auto.blk.2.kv_cache.quantizer3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.2.attn_output.q_input3Arscale = util.global.load @"__auto.blk.2.attn_output.q_input:rscale" : tensor<f32>
%43 = torch_c.from_builtin_tensor %__auto.blk.2.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.2.attn_output.weight3Aqs = util.global.load @"__auto.blk.2.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%44 = torch_c.from_builtin_tensor %__auto.blk.2.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.2.ffn_norm.weight = util.global.load @__auto.blk.2.ffn_norm.weight : tensor<4096xbf16>
%45 = torch_c.from_builtin_tensor %__auto.blk.2.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.blk.2.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.2.ffn_gate.q_input:rscale" : tensor<f32>
%46 = torch_c.from_builtin_tensor %__auto.blk.2.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.2.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.2.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%47 = torch_c.from_builtin_tensor %__auto.blk.2.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.2.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.2.ffn_up.q_input:rscale" : tensor<f32>
%48 = torch_c.from_builtin_tensor %__auto.blk.2.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.2.ffn_up.weight3Aqs = util.global.load @"__auto.blk.2.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%49 = torch_c.from_builtin_tensor %__auto.blk.2.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.2.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.2.ffn_down.q_input:rscale" : tensor<f32>
%50 = torch_c.from_builtin_tensor %__auto.blk.2.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.2.ffn_down.weight3Aqs = util.global.load @"__auto.blk.2.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ>
%51 = torch_c.from_builtin_tensor %__auto.blk.2.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
%__auto.blk.3.attn_norm.weight = util.global.load @__auto.blk.3.attn_norm.weight : tensor<4096xbf16>
%52 = torch_c.from_builtin_tensor %__auto.blk.3.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.blk.3.attn_q.q_input3Arscale = util.global.load @"__auto.blk.3.attn_q.q_input:rscale" : tensor<f32>
%53 = torch_c.from_builtin_tensor %__auto.blk.3.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.3.attn_q.weight3Aqs = util.global.load @"__auto.blk.3.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%54 = torch_c.from_builtin_tensor %__auto.blk.3.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.3.attn_k.q_input3Arscale = util.global.load @"__auto.blk.3.attn_k.q_input:rscale" : tensor<f32>
%55 = torch_c.from_builtin_tensor %__auto.blk.3.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.3.attn_k.weight3Aqs = util.global.load @"__auto.blk.3.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%56 = torch_c.from_builtin_tensor %__auto.blk.3.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.3.attn_v.q_input3Arscale = util.global.load @"__auto.blk.3.attn_v.q_input:rscale" : tensor<f32>
%57 = torch_c.from_builtin_tensor %__auto.blk.3.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.3.attn_v.weight3Aqs = util.global.load @"__auto.blk.3.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%58 = torch_c.from_builtin_tensor %__auto.blk.3.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.3.kv_cache.quantizer3Arscale = util.global.load @"__auto.blk.3.kv_cache.quantizer:rscale" : tensor<f32>
%59 = torch_c.from_builtin_tensor %__auto.blk.3.kv_cache.quantizer3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.3.attn_output.q_input3Arscale = util.global.load @"__auto.blk.3.attn_output.q_input:rscale" : tensor<f32>
%60 = torch_c.from_builtin_tensor %__auto.blk.3.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.3.attn_output.weight3Aqs = util.global.load @"__auto.blk.3.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%61 = torch_c.from_builtin_tensor %__auto.blk.3.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.3.ffn_norm.weight = util.global.load @__auto.blk.3.ffn_norm.weight : tensor<4096xbf16>
%62 = torch_c.from_builtin_tensor %__auto.blk.3.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.blk.3.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.3.ffn_gate.q_input:rscale" : tensor<f32>
%63 = torch_c.from_builtin_tensor %__auto.blk.3.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.3.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.3.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%64 = torch_c.from_builtin_tensor %__auto.blk.3.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.3.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.3.ffn_up.q_input:rscale" : tensor<f32>
%65 = torch_c.from_builtin_tensor %__auto.blk.3.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.3.ffn_up.weight3Aqs = util.global.load @"__auto.blk.3.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%66 = torch_c.from_builtin_tensor %__auto.blk.3.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.3.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.3.ffn_down.q_input:rscale" : tensor<f32>
%67 = torch_c.from_builtin_tensor %__auto.blk.3.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.3.ffn_down.weight3Aqs = util.global.load @"__auto.blk.3.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ>
%68 = torch_c.from_builtin_tensor %__auto.blk.3.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
%__auto.blk.4.attn_norm.weight = util.global.load @__auto.blk.4.attn_norm.weight : tensor<4096xbf16>
%69 = torch_c.from_builtin_tensor %__auto.blk.4.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.blk.4.attn_q.q_input3Arscale = util.global.load @"__auto.blk.4.attn_q.q_input:rscale" : tensor<f32>
%70 = torch_c.from_builtin_tensor %__auto.blk.4.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.4.attn_q.weight3Aqs = util.global.load @"__auto.blk.4.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%71 = torch_c.from_builtin_tensor %__auto.blk.4.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.4.attn_k.q_input3Arscale = util.global.load @"__auto.blk.4.attn_k.q_input:rscale" : tensor<f32>
%72 = torch_c.from_builtin_tensor %__auto.blk.4.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.4.attn_k.weight3Aqs = util.global.load @"__auto.blk.4.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%73 = torch_c.from_builtin_tensor %__auto.blk.4.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.4.attn_v.q_input3Arscale = util.global.load @"__auto.blk.4.attn_v.q_input:rscale" : tensor<f32>
%74 = torch_c.from_builtin_tensor %__auto.blk.4.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.4.attn_v.weight3Aqs = util.global.load @"__auto.blk.4.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%75 = torch_c.from_builtin_tensor %__auto.blk.4.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.4.kv_cache.quantizer3Arscale = util.global.load @"__auto.blk.4.kv_cache.quantizer:rscale" : tensor<f32>
%76 = torch_c.from_builtin_tensor %__auto.blk.4.kv_cache.quantizer3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.4.attn_output.q_input3Arscale = util.global.load @"__auto.blk.4.attn_output.q_input:rscale" : tensor<f32>
%77 = torch_c.from_builtin_tensor %__auto.blk.4.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.4.attn_output.weight3Aqs = util.global.load @"__auto.blk.4.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%78 = torch_c.from_builtin_tensor %__auto.blk.4.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.4.ffn_norm.weight = util.global.load @__auto.blk.4.ffn_norm.weight : tensor<4096xbf16>
%79 = torch_c.from_builtin_tensor %__auto.blk.4.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.blk.4.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.4.ffn_gate.q_input:rscale" : tensor<f32>
%80 = torch_c.from_builtin_tensor %__auto.blk.4.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.4.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.4.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%81 = torch_c.from_builtin_tensor %__auto.blk.4.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.4.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.4.ffn_up.q_input:rscale" : tensor<f32>
%82 = torch_c.from_builtin_tensor %__auto.blk.4.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.4.ffn_up.weight3Aqs = util.global.load @"__auto.blk.4.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%83 = torch_c.from_builtin_tensor %__auto.blk.4.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.4.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.4.ffn_down.q_input:rscale" : tensor<f32>
%84 = torch_c.from_builtin_tensor %__auto.blk.4.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.4.ffn_down.weight3Aqs = util.global.load @"__auto.blk.4.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ>
%85 = torch_c.from_builtin_tensor %__auto.blk.4.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
%__auto.blk.5.attn_norm.weight = util.global.load @__auto.blk.5.attn_norm.weight : tensor<4096xbf16>
%86 = torch_c.from_builtin_tensor %__auto.blk.5.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.blk.5.attn_q.q_input3Arscale = util.global.load @"__auto.blk.5.attn_q.q_input:rscale" : tensor<f32>
%87 = torch_c.from_builtin_tensor %__auto.blk.5.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.5.attn_q.weight3Aqs = util.global.load @"__auto.blk.5.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%88 = torch_c.from_builtin_tensor %__auto.blk.5.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.5.attn_k.q_input3Arscale = util.global.load @"__auto.blk.5.attn_k.q_input:rscale" : tensor<f32>
%89 = torch_c.from_builtin_tensor %__auto.blk.5.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.5.attn_k.weight3Aqs = util.global.load @"__auto.blk.5.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%90 = torch_c.from_builtin_tensor %__auto.blk.5.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.5.attn_v.q_input3Arscale = util.global.load @"__auto.blk.5.attn_v.q_input:rscale" : tensor<f32>
%91 = torch_c.from_builtin_tensor %__auto.blk.5.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.5.attn_v.weight3Aqs = util.global.load @"__auto.blk.5.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%92 = torch_c.from_builtin_tensor %__auto.blk.5.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.5.kv_cache.quantizer3Arscale = util.global.load @"__auto.blk.5.kv_cache.quantizer:rscale" : tensor<f32>
%93 = torch_c.from_builtin_tensor %__auto.blk.5.kv_cache.quantizer3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.5.attn_output.q_input3Arscale = util.global.load @"__auto.blk.5.attn_output.q_input:rscale" : tensor<f32>
%94 = torch_c.from_builtin_tensor %__auto.blk.5.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.5.attn_output.weight3Aqs = util.global.load @"__auto.blk.5.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%95 = torch_c.from_builtin_tensor %__auto.blk.5.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.5.ffn_norm.weight = util.global.load @__auto.blk.5.ffn_norm.weight : tensor<4096xbf16>
%96 = torch_c.from_builtin_tensor %__auto.blk.5.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.blk.5.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.5.ffn_gate.q_input:rscale" : tensor<f32>
%97 = torch_c.from_builtin_tensor %__auto.blk.5.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.5.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.5.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%98 = torch_c.from_builtin_tensor %__auto.blk.5.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.5.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.5.ffn_up.q_input:rscale" : tensor<f32>
%99 = torch_c.from_builtin_tensor %__auto.blk.5.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.5.ffn_up.weight3Aqs = util.global.load @"__auto.blk.5.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%100 = torch_c.from_builtin_tensor %__auto.blk.5.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.5.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.5.ffn_down.q_input:rscale" : tensor<f32>
%101 = torch_c.from_builtin_tensor %__auto.blk.5.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.5.ffn_down.weight3Aqs = util.global.load @"__auto.blk.5.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ>
%102 = torch_c.from_builtin_tensor %__auto.blk.5.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
%__auto.blk.6.attn_norm.weight = util.global.load @__auto.blk.6.attn_norm.weight : tensor<4096xbf16>
%103 = torch_c.from_builtin_tensor %__auto.blk.6.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.blk.6.attn_q.q_input3Arscale = util.global.load @"__auto.blk.6.attn_q.q_input:rscale" : tensor<f32>
%104 = torch_c.from_builtin_tensor %__auto.blk.6.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.6.attn_q.weight3Aqs = util.global.load @"__auto.blk.6.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%105 = torch_c.from_builtin_tensor %__auto.blk.6.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.6.attn_k.q_input3Arscale = util.global.load @"__auto.blk.6.attn_k.q_input:rscale" : tensor<f32>
%106 = torch_c.from_builtin_tensor %__auto.blk.6.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.6.attn_k.weight3Aqs = util.global.load @"__auto.blk.6.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%107 = torch_c.from_builtin_tensor %__auto.blk.6.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.6.attn_v.q_input3Arscale = util.global.load @"__auto.blk.6.attn_v.q_input:rscale" : tensor<f32>
%108 = torch_c.from_builtin_tensor %__auto.blk.6.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.6.attn_v.weight3Aqs = util.global.load @"__auto.blk.6.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%109 = torch_c.from_builtin_tensor %__auto.blk.6.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.6.kv_cache.quantizer3Arscale = util.global.load @"__auto.blk.6.kv_cache.quantizer:rscale" : tensor<f32>
%110 = torch_c.from_builtin_tensor %__auto.blk.6.kv_cache.quantizer3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.6.attn_output.q_input3Arscale = util.global.load @"__auto.blk.6.attn_output.q_input:rscale" : tensor<f32>
%111 = torch_c.from_builtin_tensor %__auto.blk.6.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.6.attn_output.weight3Aqs = util.global.load @"__auto.blk.6.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%112 = torch_c.from_builtin_tensor %__auto.blk.6.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.6.ffn_norm.weight = util.global.load @__auto.blk.6.ffn_norm.weight : tensor<4096xbf16>
%113 = torch_c.from_builtin_tensor %__auto.blk.6.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.blk.6.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.6.ffn_gate.q_input:rscale" : tensor<f32>
%114 = torch_c.from_builtin_tensor %__auto.blk.6.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.6.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.6.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%115 = torch_c.from_builtin_tensor %__auto.blk.6.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.6.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.6.ffn_up.q_input:rscale" : tensor<f32>
%116 = torch_c.from_builtin_tensor %__auto.blk.6.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.6.ffn_up.weight3Aqs = util.global.load @"__auto.blk.6.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%117 = torch_c.from_builtin_tensor %__auto.blk.6.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.6.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.6.ffn_down.q_input:rscale" : tensor<f32>
%118 = torch_c.from_builtin_tensor %__auto.blk.6.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.6.ffn_down.weight3Aqs = util.global.load @"__auto.blk.6.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ>
%119 = torch_c.from_builtin_tensor %__auto.blk.6.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
%__auto.blk.7.attn_norm.weight = util.global.load @__auto.blk.7.attn_norm.weight : tensor<4096xbf16>
%120 = torch_c.from_builtin_tensor %__auto.blk.7.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.blk.7.attn_q.q_input3Arscale = util.global.load @"__auto.blk.7.attn_q.q_input:rscale" : tensor<f32>
%121 = torch_c.from_builtin_tensor %__auto.blk.7.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.7.attn_q.weight3Aqs = util.global.load @"__auto.blk.7.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%122 = torch_c.from_builtin_tensor %__auto.blk.7.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.7.attn_k.q_input3Arscale = util.global.load @"__auto.blk.7.attn_k.q_input:rscale" : tensor<f32>
%123 = torch_c.from_builtin_tensor %__auto.blk.7.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.7.attn_k.weight3Aqs = util.global.load @"__auto.blk.7.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%124 = torch_c.from_builtin_tensor %__auto.blk.7.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.7.attn_v.q_input3Arscale = util.global.load @"__auto.blk.7.attn_v.q_input:rscale" : tensor<f32>
%125 = torch_c.from_builtin_tensor %__auto.blk.7.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.7.attn_v.weight3Aqs = util.global.load @"__auto.blk.7.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%126 = torch_c.from_builtin_tensor %__auto.blk.7.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.7.kv_cache.quantizer3Arscale = util.global.load @"__auto.blk.7.kv_cache.quantizer:rscale" : tensor<f32>
%127 = torch_c.from_builtin_tensor %__auto.blk.7.kv_cache.quantizer3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.7.attn_output.q_input3Arscale = util.global.load @"__auto.blk.7.attn_output.q_input:rscale" : tensor<f32>
%128 = torch_c.from_builtin_tensor %__auto.blk.7.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.7.attn_output.weight3Aqs = util.global.load @"__auto.blk.7.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%129 = torch_c.from_builtin_tensor %__auto.blk.7.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.7.ffn_norm.weight = util.global.load @__auto.blk.7.ffn_norm.weight : tensor<4096xbf16>
%130 = torch_c.from_builtin_tensor %__auto.blk.7.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.blk.7.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.7.ffn_gate.q_input:rscale" : tensor<f32>
%131 = torch_c.from_builtin_tensor %__auto.blk.7.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.7.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.7.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%132 = torch_c.from_builtin_tensor %__auto.blk.7.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.7.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.7.ffn_up.q_input:rscale" : tensor<f32>
%133 = torch_c.from_builtin_tensor %__auto.blk.7.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.7.ffn_up.weight3Aqs = util.global.load @"__auto.blk.7.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%134 = torch_c.from_builtin_tensor %__auto.blk.7.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.7.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.7.ffn_down.q_input:rscale" : tensor<f32>
%135 = torch_c.from_builtin_tensor %__auto.blk.7.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.7.ffn_down.weight3Aqs = util.global.load @"__auto.blk.7.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ>
%136 = torch_c.from_builtin_tensor %__auto.blk.7.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
%__auto.blk.8.attn_norm.weight = util.global.load @__auto.blk.8.attn_norm.weight : tensor<4096xbf16>
%137 = torch_c.from_builtin_tensor %__auto.blk.8.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.blk.8.attn_q.q_input3Arscale = util.global.load @"__auto.blk.8.attn_q.q_input:rscale" : tensor<f32>
%138 = torch_c.from_builtin_tensor %__auto.blk.8.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.8.attn_q.weight3Aqs = util.global.load @"__auto.blk.8.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%139 = torch_c.from_builtin_tensor %__auto.blk.8.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.8.attn_k.q_input3Arscale = util.global.load @"__auto.blk.8.attn_k.q_input:rscale" : tensor<f32>
%140 = torch_c.from_builtin_tensor %__auto.blk.8.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.8.attn_k.weight3Aqs = util.global.load @"__auto.blk.8.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%141 = torch_c.from_builtin_tensor %__auto.blk.8.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.8.attn_v.q_input3Arscale = util.global.load @"__auto.blk.8.attn_v.q_input:rscale" : tensor<f32>
%142 = torch_c.from_builtin_tensor %__auto.blk.8.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.8.attn_v.weight3Aqs = util.global.load @"__auto.blk.8.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%143 = torch_c.from_builtin_tensor %__auto.blk.8.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.8.kv_cache.quantizer3Arscale = util.global.load @"__auto.blk.8.kv_cache.quantizer:rscale" : tensor<f32>
%144 = torch_c.from_builtin_tensor %__auto.blk.8.kv_cache.quantizer3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.8.attn_output.q_input3Arscale = util.global.load @"__auto.blk.8.attn_output.q_input:rscale" : tensor<f32>
%145 = torch_c.from_builtin_tensor %__auto.blk.8.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.8.attn_output.weight3Aqs = util.global.load @"__auto.blk.8.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%146 = torch_c.from_builtin_tensor %__auto.blk.8.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.8.ffn_norm.weight = util.global.load @__auto.blk.8.ffn_norm.weight : tensor<4096xbf16>
%147 = torch_c.from_builtin_tensor %__auto.blk.8.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.blk.8.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.8.ffn_gate.q_input:rscale" : tensor<f32>
%148 = torch_c.from_builtin_tensor %__auto.blk.8.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.8.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.8.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%149 = torch_c.from_builtin_tensor %__auto.blk.8.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.8.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.8.ffn_up.q_input:rscale" : tensor<f32>
%150 = torch_c.from_builtin_tensor %__auto.blk.8.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.8.ffn_up.weight3Aqs = util.global.load @"__auto.blk.8.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%151 = torch_c.from_builtin_tensor %__auto.blk.8.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.8.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.8.ffn_down.q_input:rscale" : tensor<f32>
%152 = torch_c.from_builtin_tensor %__auto.blk.8.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.8.ffn_down.weight3Aqs = util.global.load @"__auto.blk.8.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ>
%153 = torch_c.from_builtin_tensor %__auto.blk.8.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
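    // blk.9 parameter loads (same layout as blk.8).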
%__auto.blk.9.attn_norm.weight = util.global.load @__auto.blk.9.attn_norm.weight : tensor<4096xbf16>
%154 = torch_c.from_builtin_tensor %__auto.blk.9.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.blk.9.attn_q.q_input3Arscale = util.global.load @"__auto.blk.9.attn_q.q_input:rscale" : tensor<f32>
%155 = torch_c.from_builtin_tensor %__auto.blk.9.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.9.attn_q.weight3Aqs = util.global.load @"__auto.blk.9.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%156 = torch_c.from_builtin_tensor %__auto.blk.9.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.9.attn_k.q_input3Arscale = util.global.load @"__auto.blk.9.attn_k.q_input:rscale" : tensor<f32>
%157 = torch_c.from_builtin_tensor %__auto.blk.9.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.9.attn_k.weight3Aqs = util.global.load @"__auto.blk.9.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%158 = torch_c.from_builtin_tensor %__auto.blk.9.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.9.attn_v.q_input3Arscale = util.global.load @"__auto.blk.9.attn_v.q_input:rscale" : tensor<f32>
%159 = torch_c.from_builtin_tensor %__auto.blk.9.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.9.attn_v.weight3Aqs = util.global.load @"__auto.blk.9.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%160 = torch_c.from_builtin_tensor %__auto.blk.9.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.9.kv_cache.quantizer3Arscale = util.global.load @"__auto.blk.9.kv_cache.quantizer:rscale" : tensor<f32>
%161 = torch_c.from_builtin_tensor %__auto.blk.9.kv_cache.quantizer3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.9.attn_output.q_input3Arscale = util.global.load @"__auto.blk.9.attn_output.q_input:rscale" : tensor<f32>
%162 = torch_c.from_builtin_tensor %__auto.blk.9.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.9.attn_output.weight3Aqs = util.global.load @"__auto.blk.9.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%163 = torch_c.from_builtin_tensor %__auto.blk.9.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.9.ffn_norm.weight = util.global.load @__auto.blk.9.ffn_norm.weight : tensor<4096xbf16>
%164 = torch_c.from_builtin_tensor %__auto.blk.9.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.blk.9.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.9.ffn_gate.q_input:rscale" : tensor<f32>
%165 = torch_c.from_builtin_tensor %__auto.blk.9.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.9.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.9.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%166 = torch_c.from_builtin_tensor %__auto.blk.9.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.9.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.9.ffn_up.q_input:rscale" : tensor<f32>
%167 = torch_c.from_builtin_tensor %__auto.blk.9.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.9.ffn_up.weight3Aqs = util.global.load @"__auto.blk.9.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%168 = torch_c.from_builtin_tensor %__auto.blk.9.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.9.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.9.ffn_down.q_input:rscale" : tensor<f32>
%169 = torch_c.from_builtin_tensor %__auto.blk.9.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.9.ffn_down.weight3Aqs = util.global.load @"__auto.blk.9.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ>
%170 = torch_c.from_builtin_tensor %__auto.blk.9.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
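    // blk.10 parameter loads.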
%__auto.blk.10.attn_norm.weight = util.global.load @__auto.blk.10.attn_norm.weight : tensor<4096xbf16>
%171 = torch_c.from_builtin_tensor %__auto.blk.10.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.blk.10.attn_q.q_input3Arscale = util.global.load @"__auto.blk.10.attn_q.q_input:rscale" : tensor<f32>
%172 = torch_c.from_builtin_tensor %__auto.blk.10.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.10.attn_q.weight3Aqs = util.global.load @"__auto.blk.10.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%173 = torch_c.from_builtin_tensor %__auto.blk.10.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.10.attn_k.q_input3Arscale = util.global.load @"__auto.blk.10.attn_k.q_input:rscale" : tensor<f32>
%174 = torch_c.from_builtin_tensor %__auto.blk.10.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.10.attn_k.weight3Aqs = util.global.load @"__auto.blk.10.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%175 = torch_c.from_builtin_tensor %__auto.blk.10.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.10.attn_v.q_input3Arscale = util.global.load @"__auto.blk.10.attn_v.q_input:rscale" : tensor<f32>
%176 = torch_c.from_builtin_tensor %__auto.blk.10.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.10.attn_v.weight3Aqs = util.global.load @"__auto.blk.10.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%177 = torch_c.from_builtin_tensor %__auto.blk.10.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.10.kv_cache.quantizer3Arscale = util.global.load @"__auto.blk.10.kv_cache.quantizer:rscale" : tensor<f32>
%178 = torch_c.from_builtin_tensor %__auto.blk.10.kv_cache.quantizer3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.10.attn_output.q_input3Arscale = util.global.load @"__auto.blk.10.attn_output.q_input:rscale" : tensor<f32>
%179 = torch_c.from_builtin_tensor %__auto.blk.10.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.10.attn_output.weight3Aqs = util.global.load @"__auto.blk.10.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%180 = torch_c.from_builtin_tensor %__auto.blk.10.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.10.ffn_norm.weight = util.global.load @__auto.blk.10.ffn_norm.weight : tensor<4096xbf16>
%181 = torch_c.from_builtin_tensor %__auto.blk.10.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.blk.10.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.10.ffn_gate.q_input:rscale" : tensor<f32>
%182 = torch_c.from_builtin_tensor %__auto.blk.10.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.10.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.10.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%183 = torch_c.from_builtin_tensor %__auto.blk.10.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.10.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.10.ffn_up.q_input:rscale" : tensor<f32>
%184 = torch_c.from_builtin_tensor %__auto.blk.10.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.10.ffn_up.weight3Aqs = util.global.load @"__auto.blk.10.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%185 = torch_c.from_builtin_tensor %__auto.blk.10.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.10.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.10.ffn_down.q_input:rscale" : tensor<f32>
%186 = torch_c.from_builtin_tensor %__auto.blk.10.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.10.ffn_down.weight3Aqs = util.global.load @"__auto.blk.10.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ>
%187 = torch_c.from_builtin_tensor %__auto.blk.10.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
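    // blk.11 parameter loads.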
%__auto.blk.11.attn_norm.weight = util.global.load @__auto.blk.11.attn_norm.weight : tensor<4096xbf16>
%188 = torch_c.from_builtin_tensor %__auto.blk.11.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.blk.11.attn_q.q_input3Arscale = util.global.load @"__auto.blk.11.attn_q.q_input:rscale" : tensor<f32>
%189 = torch_c.from_builtin_tensor %__auto.blk.11.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.11.attn_q.weight3Aqs = util.global.load @"__auto.blk.11.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%190 = torch_c.from_builtin_tensor %__auto.blk.11.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.11.attn_k.q_input3Arscale = util.global.load @"__auto.blk.11.attn_k.q_input:rscale" : tensor<f32>
%191 = torch_c.from_builtin_tensor %__auto.blk.11.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.11.attn_k.weight3Aqs = util.global.load @"__auto.blk.11.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%192 = torch_c.from_builtin_tensor %__auto.blk.11.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.11.attn_v.q_input3Arscale = util.global.load @"__auto.blk.11.attn_v.q_input:rscale" : tensor<f32>
%193 = torch_c.from_builtin_tensor %__auto.blk.11.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.11.attn_v.weight3Aqs = util.global.load @"__auto.blk.11.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%194 = torch_c.from_builtin_tensor %__auto.blk.11.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.11.kv_cache.quantizer3Arscale = util.global.load @"__auto.blk.11.kv_cache.quantizer:rscale" : tensor<f32>
%195 = torch_c.from_builtin_tensor %__auto.blk.11.kv_cache.quantizer3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.11.attn_output.q_input3Arscale = util.global.load @"__auto.blk.11.attn_output.q_input:rscale" : tensor<f32>
%196 = torch_c.from_builtin_tensor %__auto.blk.11.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.11.attn_output.weight3Aqs = util.global.load @"__auto.blk.11.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%197 = torch_c.from_builtin_tensor %__auto.blk.11.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.11.ffn_norm.weight = util.global.load @__auto.blk.11.ffn_norm.weight : tensor<4096xbf16>
%198 = torch_c.from_builtin_tensor %__auto.blk.11.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.blk.11.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.11.ffn_gate.q_input:rscale" : tensor<f32>
%199 = torch_c.from_builtin_tensor %__auto.blk.11.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.11.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.11.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%200 = torch_c.from_builtin_tensor %__auto.blk.11.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.11.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.11.ffn_up.q_input:rscale" : tensor<f32>
%201 = torch_c.from_builtin_tensor %__auto.blk.11.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.11.ffn_up.weight3Aqs = util.global.load @"__auto.blk.11.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%202 = torch_c.from_builtin_tensor %__auto.blk.11.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.11.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.11.ffn_down.q_input:rscale" : tensor<f32>
%203 = torch_c.from_builtin_tensor %__auto.blk.11.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.11.ffn_down.weight3Aqs = util.global.load @"__auto.blk.11.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ>
%204 = torch_c.from_builtin_tensor %__auto.blk.11.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
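    // blk.12 parameter loads.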
%__auto.blk.12.attn_norm.weight = util.global.load @__auto.blk.12.attn_norm.weight : tensor<4096xbf16>
%205 = torch_c.from_builtin_tensor %__auto.blk.12.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.blk.12.attn_q.q_input3Arscale = util.global.load @"__auto.blk.12.attn_q.q_input:rscale" : tensor<f32>
%206 = torch_c.from_builtin_tensor %__auto.blk.12.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.12.attn_q.weight3Aqs = util.global.load @"__auto.blk.12.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%207 = torch_c.from_builtin_tensor %__auto.blk.12.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.12.attn_k.q_input3Arscale = util.global.load @"__auto.blk.12.attn_k.q_input:rscale" : tensor<f32>
%208 = torch_c.from_builtin_tensor %__auto.blk.12.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.12.attn_k.weight3Aqs = util.global.load @"__auto.blk.12.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%209 = torch_c.from_builtin_tensor %__auto.blk.12.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.12.attn_v.q_input3Arscale = util.global.load @"__auto.blk.12.attn_v.q_input:rscale" : tensor<f32>
%210 = torch_c.from_builtin_tensor %__auto.blk.12.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.12.attn_v.weight3Aqs = util.global.load @"__auto.blk.12.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%211 = torch_c.from_builtin_tensor %__auto.blk.12.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.12.kv_cache.quantizer3Arscale = util.global.load @"__auto.blk.12.kv_cache.quantizer:rscale" : tensor<f32>
%212 = torch_c.from_builtin_tensor %__auto.blk.12.kv_cache.quantizer3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.12.attn_output.q_input3Arscale = util.global.load @"__auto.blk.12.attn_output.q_input:rscale" : tensor<f32>
%213 = torch_c.from_builtin_tensor %__auto.blk.12.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.12.attn_output.weight3Aqs = util.global.load @"__auto.blk.12.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%214 = torch_c.from_builtin_tensor %__auto.blk.12.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.12.ffn_norm.weight = util.global.load @__auto.blk.12.ffn_norm.weight : tensor<4096xbf16>
%215 = torch_c.from_builtin_tensor %__auto.blk.12.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.blk.12.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.12.ffn_gate.q_input:rscale" : tensor<f32>
%216 = torch_c.from_builtin_tensor %__auto.blk.12.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.12.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.12.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%217 = torch_c.from_builtin_tensor %__auto.blk.12.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.12.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.12.ffn_up.q_input:rscale" : tensor<f32>
%218 = torch_c.from_builtin_tensor %__auto.blk.12.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.12.ffn_up.weight3Aqs = util.global.load @"__auto.blk.12.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%219 = torch_c.from_builtin_tensor %__auto.blk.12.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.12.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.12.ffn_down.q_input:rscale" : tensor<f32>
%220 = torch_c.from_builtin_tensor %__auto.blk.12.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.12.ffn_down.weight3Aqs = util.global.load @"__auto.blk.12.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ>
%221 = torch_c.from_builtin_tensor %__auto.blk.12.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
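    // blk.13 parameter loads.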
%__auto.blk.13.attn_norm.weight = util.global.load @__auto.blk.13.attn_norm.weight : tensor<4096xbf16>
%222 = torch_c.from_builtin_tensor %__auto.blk.13.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.blk.13.attn_q.q_input3Arscale = util.global.load @"__auto.blk.13.attn_q.q_input:rscale" : tensor<f32>
%223 = torch_c.from_builtin_tensor %__auto.blk.13.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.13.attn_q.weight3Aqs = util.global.load @"__auto.blk.13.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%224 = torch_c.from_builtin_tensor %__auto.blk.13.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.13.attn_k.q_input3Arscale = util.global.load @"__auto.blk.13.attn_k.q_input:rscale" : tensor<f32>
%225 = torch_c.from_builtin_tensor %__auto.blk.13.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.13.attn_k.weight3Aqs = util.global.load @"__auto.blk.13.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%226 = torch_c.from_builtin_tensor %__auto.blk.13.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.13.attn_v.q_input3Arscale = util.global.load @"__auto.blk.13.attn_v.q_input:rscale" : tensor<f32>
%227 = torch_c.from_builtin_tensor %__auto.blk.13.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.13.attn_v.weight3Aqs = util.global.load @"__auto.blk.13.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%228 = torch_c.from_builtin_tensor %__auto.blk.13.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.13.kv_cache.quantizer3Arscale = util.global.load @"__auto.blk.13.kv_cache.quantizer:rscale" : tensor<f32>
%229 = torch_c.from_builtin_tensor %__auto.blk.13.kv_cache.quantizer3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.13.attn_output.q_input3Arscale = util.global.load @"__auto.blk.13.attn_output.q_input:rscale" : tensor<f32>
%230 = torch_c.from_builtin_tensor %__auto.blk.13.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.13.attn_output.weight3Aqs = util.global.load @"__auto.blk.13.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%231 = torch_c.from_builtin_tensor %__auto.blk.13.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.13.ffn_norm.weight = util.global.load @__auto.blk.13.ffn_norm.weight : tensor<4096xbf16>
%232 = torch_c.from_builtin_tensor %__auto.blk.13.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.blk.13.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.13.ffn_gate.q_input:rscale" : tensor<f32>
%233 = torch_c.from_builtin_tensor %__auto.blk.13.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.13.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.13.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%234 = torch_c.from_builtin_tensor %__auto.blk.13.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.13.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.13.ffn_up.q_input:rscale" : tensor<f32>
%235 = torch_c.from_builtin_tensor %__auto.blk.13.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.13.ffn_up.weight3Aqs = util.global.load @"__auto.blk.13.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%236 = torch_c.from_builtin_tensor %__auto.blk.13.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.13.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.13.ffn_down.q_input:rscale" : tensor<f32>
%237 = torch_c.from_builtin_tensor %__auto.blk.13.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.13.ffn_down.weight3Aqs = util.global.load @"__auto.blk.13.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ>
%238 = torch_c.from_builtin_tensor %__auto.blk.13.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
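    // blk.14 parameter loads.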
%__auto.blk.14.attn_norm.weight = util.global.load @__auto.blk.14.attn_norm.weight : tensor<4096xbf16>
%239 = torch_c.from_builtin_tensor %__auto.blk.14.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.blk.14.attn_q.q_input3Arscale = util.global.load @"__auto.blk.14.attn_q.q_input:rscale" : tensor<f32>
%240 = torch_c.from_builtin_tensor %__auto.blk.14.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.14.attn_q.weight3Aqs = util.global.load @"__auto.blk.14.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%241 = torch_c.from_builtin_tensor %__auto.blk.14.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.14.attn_k.q_input3Arscale = util.global.load @"__auto.blk.14.attn_k.q_input:rscale" : tensor<f32>
%242 = torch_c.from_builtin_tensor %__auto.blk.14.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.14.attn_k.weight3Aqs = util.global.load @"__auto.blk.14.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%243 = torch_c.from_builtin_tensor %__auto.blk.14.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.14.attn_v.q_input3Arscale = util.global.load @"__auto.blk.14.attn_v.q_input:rscale" : tensor<f32>
%244 = torch_c.from_builtin_tensor %__auto.blk.14.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.14.attn_v.weight3Aqs = util.global.load @"__auto.blk.14.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%245 = torch_c.from_builtin_tensor %__auto.blk.14.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.14.kv_cache.quantizer3Arscale = util.global.load @"__auto.blk.14.kv_cache.quantizer:rscale" : tensor<f32>
%246 = torch_c.from_builtin_tensor %__auto.blk.14.kv_cache.quantizer3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.14.attn_output.q_input3Arscale = util.global.load @"__auto.blk.14.attn_output.q_input:rscale" : tensor<f32>
%247 = torch_c.from_builtin_tensor %__auto.blk.14.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.14.attn_output.weight3Aqs = util.global.load @"__auto.blk.14.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%248 = torch_c.from_builtin_tensor %__auto.blk.14.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.14.ffn_norm.weight = util.global.load @__auto.blk.14.ffn_norm.weight : tensor<4096xbf16>
%249 = torch_c.from_builtin_tensor %__auto.blk.14.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.blk.14.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.14.ffn_gate.q_input:rscale" : tensor<f32>
%250 = torch_c.from_builtin_tensor %__auto.blk.14.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.14.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.14.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%251 = torch_c.from_builtin_tensor %__auto.blk.14.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.14.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.14.ffn_up.q_input:rscale" : tensor<f32>
%252 = torch_c.from_builtin_tensor %__auto.blk.14.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.14.ffn_up.weight3Aqs = util.global.load @"__auto.blk.14.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%253 = torch_c.from_builtin_tensor %__auto.blk.14.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.14.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.14.ffn_down.q_input:rscale" : tensor<f32>
%254 = torch_c.from_builtin_tensor %__auto.blk.14.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.14.ffn_down.weight3Aqs = util.global.load @"__auto.blk.14.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ>
%255 = torch_c.from_builtin_tensor %__auto.blk.14.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
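    // blk.15 parameter loads.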
%__auto.blk.15.attn_norm.weight = util.global.load @__auto.blk.15.attn_norm.weight : tensor<4096xbf16>
%256 = torch_c.from_builtin_tensor %__auto.blk.15.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.blk.15.attn_q.q_input3Arscale = util.global.load @"__auto.blk.15.attn_q.q_input:rscale" : tensor<f32>
%257 = torch_c.from_builtin_tensor %__auto.blk.15.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.15.attn_q.weight3Aqs = util.global.load @"__auto.blk.15.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%258 = torch_c.from_builtin_tensor %__auto.blk.15.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.15.attn_k.q_input3Arscale = util.global.load @"__auto.blk.15.attn_k.q_input:rscale" : tensor<f32>
%259 = torch_c.from_builtin_tensor %__auto.blk.15.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.15.attn_k.weight3Aqs = util.global.load @"__auto.blk.15.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%260 = torch_c.from_builtin_tensor %__auto.blk.15.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.15.attn_v.q_input3Arscale = util.global.load @"__auto.blk.15.attn_v.q_input:rscale" : tensor<f32>
%261 = torch_c.from_builtin_tensor %__auto.blk.15.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.15.attn_v.weight3Aqs = util.global.load @"__auto.blk.15.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%262 = torch_c.from_builtin_tensor %__auto.blk.15.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.15.kv_cache.quantizer3Arscale = util.global.load @"__auto.blk.15.kv_cache.quantizer:rscale" : tensor<f32>
%263 = torch_c.from_builtin_tensor %__auto.blk.15.kv_cache.quantizer3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.15.attn_output.q_input3Arscale = util.global.load @"__auto.blk.15.attn_output.q_input:rscale" : tensor<f32>
%264 = torch_c.from_builtin_tensor %__auto.blk.15.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.15.attn_output.weight3Aqs = util.global.load @"__auto.blk.15.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%265 = torch_c.from_builtin_tensor %__auto.blk.15.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.15.ffn_norm.weight = util.global.load @__auto.blk.15.ffn_norm.weight : tensor<4096xbf16>
%266 = torch_c.from_builtin_tensor %__auto.blk.15.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.blk.15.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.15.ffn_gate.q_input:rscale" : tensor<f32>
%267 = torch_c.from_builtin_tensor %__auto.blk.15.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.15.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.15.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%268 = torch_c.from_builtin_tensor %__auto.blk.15.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.15.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.15.ffn_up.q_input:rscale" : tensor<f32>
%269 = torch_c.from_builtin_tensor %__auto.blk.15.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.15.ffn_up.weight3Aqs = util.global.load @"__auto.blk.15.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%270 = torch_c.from_builtin_tensor %__auto.blk.15.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.15.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.15.ffn_down.q_input:rscale" : tensor<f32>
%271 = torch_c.from_builtin_tensor %__auto.blk.15.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.15.ffn_down.weight3Aqs = util.global.load @"__auto.blk.15.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ>
%272 = torch_c.from_builtin_tensor %__auto.blk.15.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
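    // blk.16 parameter loads.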
%__auto.blk.16.attn_norm.weight = util.global.load @__auto.blk.16.attn_norm.weight : tensor<4096xbf16>
%273 = torch_c.from_builtin_tensor %__auto.blk.16.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.blk.16.attn_q.q_input3Arscale = util.global.load @"__auto.blk.16.attn_q.q_input:rscale" : tensor<f32>
%274 = torch_c.from_builtin_tensor %__auto.blk.16.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.16.attn_q.weight3Aqs = util.global.load @"__auto.blk.16.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%275 = torch_c.from_builtin_tensor %__auto.blk.16.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.16.attn_k.q_input3Arscale = util.global.load @"__auto.blk.16.attn_k.q_input:rscale" : tensor<f32>
%276 = torch_c.from_builtin_tensor %__auto.blk.16.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.16.attn_k.weight3Aqs = util.global.load @"__auto.blk.16.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%277 = torch_c.from_builtin_tensor %__auto.blk.16.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.16.attn_v.q_input3Arscale = util.global.load @"__auto.blk.16.attn_v.q_input:rscale" : tensor<f32>
%278 = torch_c.from_builtin_tensor %__auto.blk.16.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.16.attn_v.weight3Aqs = util.global.load @"__auto.blk.16.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%279 = torch_c.from_builtin_tensor %__auto.blk.16.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.16.kv_cache.quantizer3Arscale = util.global.load @"__auto.blk.16.kv_cache.quantizer:rscale" : tensor<f32>
%280 = torch_c.from_builtin_tensor %__auto.blk.16.kv_cache.quantizer3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.16.attn_output.q_input3Arscale = util.global.load @"__auto.blk.16.attn_output.q_input:rscale" : tensor<f32>
%281 = torch_c.from_builtin_tensor %__auto.blk.16.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.16.attn_output.weight3Aqs = util.global.load @"__auto.blk.16.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%282 = torch_c.from_builtin_tensor %__auto.blk.16.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.16.ffn_norm.weight = util.global.load @__auto.blk.16.ffn_norm.weight : tensor<4096xbf16>
%283 = torch_c.from_builtin_tensor %__auto.blk.16.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.blk.16.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.16.ffn_gate.q_input:rscale" : tensor<f32>
%284 = torch_c.from_builtin_tensor %__auto.blk.16.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.16.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.16.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%285 = torch_c.from_builtin_tensor %__auto.blk.16.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.16.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.16.ffn_up.q_input:rscale" : tensor<f32>
%286 = torch_c.from_builtin_tensor %__auto.blk.16.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.16.ffn_up.weight3Aqs = util.global.load @"__auto.blk.16.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%287 = torch_c.from_builtin_tensor %__auto.blk.16.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.16.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.16.ffn_down.q_input:rscale" : tensor<f32>
%288 = torch_c.from_builtin_tensor %__auto.blk.16.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.16.ffn_down.weight3Aqs = util.global.load @"__auto.blk.16.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ>
%289 = torch_c.from_builtin_tensor %__auto.blk.16.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
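    // blk.17 parameter loads.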
%__auto.blk.17.attn_norm.weight = util.global.load @__auto.blk.17.attn_norm.weight : tensor<4096xbf16>
%290 = torch_c.from_builtin_tensor %__auto.blk.17.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.blk.17.attn_q.q_input3Arscale = util.global.load @"__auto.blk.17.attn_q.q_input:rscale" : tensor<f32>
%291 = torch_c.from_builtin_tensor %__auto.blk.17.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.17.attn_q.weight3Aqs = util.global.load @"__auto.blk.17.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%292 = torch_c.from_builtin_tensor %__auto.blk.17.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.17.attn_k.q_input3Arscale = util.global.load @"__auto.blk.17.attn_k.q_input:rscale" : tensor<f32>
%293 = torch_c.from_builtin_tensor %__auto.blk.17.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.17.attn_k.weight3Aqs = util.global.load @"__auto.blk.17.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%294 = torch_c.from_builtin_tensor %__auto.blk.17.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.17.attn_v.q_input3Arscale = util.global.load @"__auto.blk.17.attn_v.q_input:rscale" : tensor<f32>
%295 = torch_c.from_builtin_tensor %__auto.blk.17.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.17.attn_v.weight3Aqs = util.global.load @"__auto.blk.17.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%296 = torch_c.from_builtin_tensor %__auto.blk.17.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.17.kv_cache.quantizer3Arscale = util.global.load @"__auto.blk.17.kv_cache.quantizer:rscale" : tensor<f32>
%297 = torch_c.from_builtin_tensor %__auto.blk.17.kv_cache.quantizer3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.17.attn_output.q_input3Arscale = util.global.load @"__auto.blk.17.attn_output.q_input:rscale" : tensor<f32>
%298 = torch_c.from_builtin_tensor %__auto.blk.17.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.17.attn_output.weight3Aqs = util.global.load @"__auto.blk.17.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%299 = torch_c.from_builtin_tensor %__auto.blk.17.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.17.ffn_norm.weight = util.global.load @__auto.blk.17.ffn_norm.weight : tensor<4096xbf16>
%300 = torch_c.from_builtin_tensor %__auto.blk.17.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.blk.17.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.17.ffn_gate.q_input:rscale" : tensor<f32>
%301 = torch_c.from_builtin_tensor %__auto.blk.17.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.17.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.17.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%302 = torch_c.from_builtin_tensor %__auto.blk.17.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.17.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.17.ffn_up.q_input:rscale" : tensor<f32>
%303 = torch_c.from_builtin_tensor %__auto.blk.17.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.17.ffn_up.weight3Aqs = util.global.load @"__auto.blk.17.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%304 = torch_c.from_builtin_tensor %__auto.blk.17.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.17.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.17.ffn_down.q_input:rscale" : tensor<f32>
%305 = torch_c.from_builtin_tensor %__auto.blk.17.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.17.ffn_down.weight3Aqs = util.global.load @"__auto.blk.17.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ>
%306 = torch_c.from_builtin_tensor %__auto.blk.17.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
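    // blk.18 parameter loads.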
%__auto.blk.18.attn_norm.weight = util.global.load @__auto.blk.18.attn_norm.weight : tensor<4096xbf16>
%307 = torch_c.from_builtin_tensor %__auto.blk.18.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.blk.18.attn_q.q_input3Arscale = util.global.load @"__auto.blk.18.attn_q.q_input:rscale" : tensor<f32>
%308 = torch_c.from_builtin_tensor %__auto.blk.18.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.18.attn_q.weight3Aqs = util.global.load @"__auto.blk.18.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%309 = torch_c.from_builtin_tensor %__auto.blk.18.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.18.attn_k.q_input3Arscale = util.global.load @"__auto.blk.18.attn_k.q_input:rscale" : tensor<f32>
%310 = torch_c.from_builtin_tensor %__auto.blk.18.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.18.attn_k.weight3Aqs = util.global.load @"__auto.blk.18.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%311 = torch_c.from_builtin_tensor %__auto.blk.18.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.18.attn_v.q_input3Arscale = util.global.load @"__auto.blk.18.attn_v.q_input:rscale" : tensor<f32>
%312 = torch_c.from_builtin_tensor %__auto.blk.18.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.18.attn_v.weight3Aqs = util.global.load @"__auto.blk.18.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%313 = torch_c.from_builtin_tensor %__auto.blk.18.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.18.kv_cache.quantizer3Arscale = util.global.load @"__auto.blk.18.kv_cache.quantizer:rscale" : tensor<f32>
%314 = torch_c.from_builtin_tensor %__auto.blk.18.kv_cache.quantizer3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.18.attn_output.q_input3Arscale = util.global.load @"__auto.blk.18.attn_output.q_input:rscale" : tensor<f32>
%315 = torch_c.from_builtin_tensor %__auto.blk.18.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.18.attn_output.weight3Aqs = util.global.load @"__auto.blk.18.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%316 = torch_c.from_builtin_tensor %__auto.blk.18.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.18.ffn_norm.weight = util.global.load @__auto.blk.18.ffn_norm.weight : tensor<4096xbf16>
%317 = torch_c.from_builtin_tensor %__auto.blk.18.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.blk.18.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.18.ffn_gate.q_input:rscale" : tensor<f32>
%318 = torch_c.from_builtin_tensor %__auto.blk.18.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.18.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.18.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%319 = torch_c.from_builtin_tensor %__auto.blk.18.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.18.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.18.ffn_up.q_input:rscale" : tensor<f32>
%320 = torch_c.from_builtin_tensor %__auto.blk.18.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.18.ffn_up.weight3Aqs = util.global.load @"__auto.blk.18.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%321 = torch_c.from_builtin_tensor %__auto.blk.18.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.18.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.18.ffn_down.q_input:rscale" : tensor<f32>
%322 = torch_c.from_builtin_tensor %__auto.blk.18.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.18.ffn_down.weight3Aqs = util.global.load @"__auto.blk.18.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ>
%323 = torch_c.from_builtin_tensor %__auto.blk.18.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
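    // blk.19 parameter loads.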
%__auto.blk.19.attn_norm.weight = util.global.load @__auto.blk.19.attn_norm.weight : tensor<4096xbf16>
%324 = torch_c.from_builtin_tensor %__auto.blk.19.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.blk.19.attn_q.q_input3Arscale = util.global.load @"__auto.blk.19.attn_q.q_input:rscale" : tensor<f32>
%325 = torch_c.from_builtin_tensor %__auto.blk.19.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.19.attn_q.weight3Aqs = util.global.load @"__auto.blk.19.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%326 = torch_c.from_builtin_tensor %__auto.blk.19.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.19.attn_k.q_input3Arscale = util.global.load @"__auto.blk.19.attn_k.q_input:rscale" : tensor<f32>
%327 = torch_c.from_builtin_tensor %__auto.blk.19.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.19.attn_k.weight3Aqs = util.global.load @"__auto.blk.19.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%328 = torch_c.from_builtin_tensor %__auto.blk.19.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.19.attn_v.q_input3Arscale = util.global.load @"__auto.blk.19.attn_v.q_input:rscale" : tensor<f32>
%329 = torch_c.from_builtin_tensor %__auto.blk.19.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.19.attn_v.weight3Aqs = util.global.load @"__auto.blk.19.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%330 = torch_c.from_builtin_tensor %__auto.blk.19.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.19.kv_cache.quantizer3Arscale = util.global.load @"__auto.blk.19.kv_cache.quantizer:rscale" : tensor<f32>
%331 = torch_c.from_builtin_tensor %__auto.blk.19.kv_cache.quantizer3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.19.attn_output.q_input3Arscale = util.global.load @"__auto.blk.19.attn_output.q_input:rscale" : tensor<f32>
%332 = torch_c.from_builtin_tensor %__auto.blk.19.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.19.attn_output.weight3Aqs = util.global.load @"__auto.blk.19.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%333 = torch_c.from_builtin_tensor %__auto.blk.19.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.19.ffn_norm.weight = util.global.load @__auto.blk.19.ffn_norm.weight : tensor<4096xbf16>
%334 = torch_c.from_builtin_tensor %__auto.blk.19.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.blk.19.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.19.ffn_gate.q_input:rscale" : tensor<f32>
%335 = torch_c.from_builtin_tensor %__auto.blk.19.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.19.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.19.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%336 = torch_c.from_builtin_tensor %__auto.blk.19.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.19.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.19.ffn_up.q_input:rscale" : tensor<f32>
%337 = torch_c.from_builtin_tensor %__auto.blk.19.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.19.ffn_up.weight3Aqs = util.global.load @"__auto.blk.19.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%338 = torch_c.from_builtin_tensor %__auto.blk.19.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.19.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.19.ffn_down.q_input:rscale" : tensor<f32>
%339 = torch_c.from_builtin_tensor %__auto.blk.19.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.19.ffn_down.weight3Aqs = util.global.load @"__auto.blk.19.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ>
%340 = torch_c.from_builtin_tensor %__auto.blk.19.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
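    // blk.20 parameter loads.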
%__auto.blk.20.attn_norm.weight = util.global.load @__auto.blk.20.attn_norm.weight : tensor<4096xbf16>
%341 = torch_c.from_builtin_tensor %__auto.blk.20.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.blk.20.attn_q.q_input3Arscale = util.global.load @"__auto.blk.20.attn_q.q_input:rscale" : tensor<f32>
%342 = torch_c.from_builtin_tensor %__auto.blk.20.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.20.attn_q.weight3Aqs = util.global.load @"__auto.blk.20.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%343 = torch_c.from_builtin_tensor %__auto.blk.20.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.20.attn_k.q_input3Arscale = util.global.load @"__auto.blk.20.attn_k.q_input:rscale" : tensor<f32>
%344 = torch_c.from_builtin_tensor %__auto.blk.20.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.20.attn_k.weight3Aqs = util.global.load @"__auto.blk.20.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%345 = torch_c.from_builtin_tensor %__auto.blk.20.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.20.attn_v.q_input3Arscale = util.global.load @"__auto.blk.20.attn_v.q_input:rscale" : tensor<f32>
%346 = torch_c.from_builtin_tensor %__auto.blk.20.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.20.attn_v.weight3Aqs = util.global.load @"__auto.blk.20.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%347 = torch_c.from_builtin_tensor %__auto.blk.20.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.20.kv_cache.quantizer3Arscale = util.global.load @"__auto.blk.20.kv_cache.quantizer:rscale" : tensor<f32>
%348 = torch_c.from_builtin_tensor %__auto.blk.20.kv_cache.quantizer3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.20.attn_output.q_input3Arscale = util.global.load @"__auto.blk.20.attn_output.q_input:rscale" : tensor<f32>
%349 = torch_c.from_builtin_tensor %__auto.blk.20.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.20.attn_output.weight3Aqs = util.global.load @"__auto.blk.20.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%350 = torch_c.from_builtin_tensor %__auto.blk.20.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.20.ffn_norm.weight = util.global.load @__auto.blk.20.ffn_norm.weight : tensor<4096xbf16>
%351 = torch_c.from_builtin_tensor %__auto.blk.20.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.blk.20.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.20.ffn_gate.q_input:rscale" : tensor<f32>
%352 = torch_c.from_builtin_tensor %__auto.blk.20.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.20.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.20.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%353 = torch_c.from_builtin_tensor %__auto.blk.20.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.20.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.20.ffn_up.q_input:rscale" : tensor<f32>
%354 = torch_c.from_builtin_tensor %__auto.blk.20.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.20.ffn_up.weight3Aqs = util.global.load @"__auto.blk.20.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%355 = torch_c.from_builtin_tensor %__auto.blk.20.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.20.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.20.ffn_down.q_input:rscale" : tensor<f32>
%356 = torch_c.from_builtin_tensor %__auto.blk.20.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.20.ffn_down.weight3Aqs = util.global.load @"__auto.blk.20.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ>
%357 = torch_c.from_builtin_tensor %__auto.blk.20.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
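    // blk.21 parameter loads.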
%__auto.blk.21.attn_norm.weight = util.global.load @__auto.blk.21.attn_norm.weight : tensor<4096xbf16>
%358 = torch_c.from_builtin_tensor %__auto.blk.21.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.blk.21.attn_q.q_input3Arscale = util.global.load @"__auto.blk.21.attn_q.q_input:rscale" : tensor<f32>
%359 = torch_c.from_builtin_tensor %__auto.blk.21.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.21.attn_q.weight3Aqs = util.global.load @"__auto.blk.21.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%360 = torch_c.from_builtin_tensor %__auto.blk.21.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.21.attn_k.q_input3Arscale = util.global.load @"__auto.blk.21.attn_k.q_input:rscale" : tensor<f32>
%361 = torch_c.from_builtin_tensor %__auto.blk.21.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.21.attn_k.weight3Aqs = util.global.load @"__auto.blk.21.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%362 = torch_c.from_builtin_tensor %__auto.blk.21.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.21.attn_v.q_input3Arscale = util.global.load @"__auto.blk.21.attn_v.q_input:rscale" : tensor<f32>
%363 = torch_c.from_builtin_tensor %__auto.blk.21.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.21.attn_v.weight3Aqs = util.global.load @"__auto.blk.21.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%364 = torch_c.from_builtin_tensor %__auto.blk.21.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.21.kv_cache.quantizer3Arscale = util.global.load @"__auto.blk.21.kv_cache.quantizer:rscale" : tensor<f32>
%365 = torch_c.from_builtin_tensor %__auto.blk.21.kv_cache.quantizer3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.21.attn_output.q_input3Arscale = util.global.load @"__auto.blk.21.attn_output.q_input:rscale" : tensor<f32>
%366 = torch_c.from_builtin_tensor %__auto.blk.21.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.21.attn_output.weight3Aqs = util.global.load @"__auto.blk.21.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%367 = torch_c.from_builtin_tensor %__auto.blk.21.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.21.ffn_norm.weight = util.global.load @__auto.blk.21.ffn_norm.weight : tensor<4096xbf16>
%368 = torch_c.from_builtin_tensor %__auto.blk.21.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.blk.21.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.21.ffn_gate.q_input:rscale" : tensor<f32>
%369 = torch_c.from_builtin_tensor %__auto.blk.21.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.21.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.21.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%370 = torch_c.from_builtin_tensor %__auto.blk.21.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.21.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.21.ffn_up.q_input:rscale" : tensor<f32>
%371 = torch_c.from_builtin_tensor %__auto.blk.21.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.21.ffn_up.weight3Aqs = util.global.load @"__auto.blk.21.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%372 = torch_c.from_builtin_tensor %__auto.blk.21.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.21.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.21.ffn_down.q_input:rscale" : tensor<f32>
%373 = torch_c.from_builtin_tensor %__auto.blk.21.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.21.ffn_down.weight3Aqs = util.global.load @"__auto.blk.21.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ>
%374 = torch_c.from_builtin_tensor %__auto.blk.21.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
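    // blk.22 parameter loads.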
%__auto.blk.22.attn_norm.weight = util.global.load @__auto.blk.22.attn_norm.weight : tensor<4096xbf16>
%375 = torch_c.from_builtin_tensor %__auto.blk.22.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.blk.22.attn_q.q_input3Arscale = util.global.load @"__auto.blk.22.attn_q.q_input:rscale" : tensor<f32>
%376 = torch_c.from_builtin_tensor %__auto.blk.22.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.22.attn_q.weight3Aqs = util.global.load @"__auto.blk.22.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%377 = torch_c.from_builtin_tensor %__auto.blk.22.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.22.attn_k.q_input3Arscale = util.global.load @"__auto.blk.22.attn_k.q_input:rscale" : tensor<f32>
%378 = torch_c.from_builtin_tensor %__auto.blk.22.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.22.attn_k.weight3Aqs = util.global.load @"__auto.blk.22.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%379 = torch_c.from_builtin_tensor %__auto.blk.22.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.22.attn_v.q_input3Arscale = util.global.load @"__auto.blk.22.attn_v.q_input:rscale" : tensor<f32>
%380 = torch_c.from_builtin_tensor %__auto.blk.22.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.22.attn_v.weight3Aqs = util.global.load @"__auto.blk.22.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%381 = torch_c.from_builtin_tensor %__auto.blk.22.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.22.kv_cache.quantizer3Arscale = util.global.load @"__auto.blk.22.kv_cache.quantizer:rscale" : tensor<f32>
%382 = torch_c.from_builtin_tensor %__auto.blk.22.kv_cache.quantizer3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.22.attn_output.q_input3Arscale = util.global.load @"__auto.blk.22.attn_output.q_input:rscale" : tensor<f32>
%383 = torch_c.from_builtin_tensor %__auto.blk.22.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.22.attn_output.weight3Aqs = util.global.load @"__auto.blk.22.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%384 = torch_c.from_builtin_tensor %__auto.blk.22.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.22.ffn_norm.weight = util.global.load @__auto.blk.22.ffn_norm.weight : tensor<4096xbf16>
%385 = torch_c.from_builtin_tensor %__auto.blk.22.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.blk.22.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.22.ffn_gate.q_input:rscale" : tensor<f32>
%386 = torch_c.from_builtin_tensor %__auto.blk.22.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.22.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.22.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%387 = torch_c.from_builtin_tensor %__auto.blk.22.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.22.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.22.ffn_up.q_input:rscale" : tensor<f32>
%388 = torch_c.from_builtin_tensor %__auto.blk.22.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.22.ffn_up.weight3Aqs = util.global.load @"__auto.blk.22.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%389 = torch_c.from_builtin_tensor %__auto.blk.22.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.22.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.22.ffn_down.q_input:rscale" : tensor<f32>
%390 = torch_c.from_builtin_tensor %__auto.blk.22.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.22.ffn_down.weight3Aqs = util.global.load @"__auto.blk.22.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ>
%391 = torch_c.from_builtin_tensor %__auto.blk.22.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
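    // blk.23 parameters (same layout as blk.22).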
%__auto.blk.23.attn_norm.weight = util.global.load @__auto.blk.23.attn_norm.weight : tensor<4096xbf16>
%392 = torch_c.from_builtin_tensor %__auto.blk.23.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.blk.23.attn_q.q_input3Arscale = util.global.load @"__auto.blk.23.attn_q.q_input:rscale" : tensor<f32>
%393 = torch_c.from_builtin_tensor %__auto.blk.23.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.23.attn_q.weight3Aqs = util.global.load @"__auto.blk.23.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%394 = torch_c.from_builtin_tensor %__auto.blk.23.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.23.attn_k.q_input3Arscale = util.global.load @"__auto.blk.23.attn_k.q_input:rscale" : tensor<f32>
%395 = torch_c.from_builtin_tensor %__auto.blk.23.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.23.attn_k.weight3Aqs = util.global.load @"__auto.blk.23.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%396 = torch_c.from_builtin_tensor %__auto.blk.23.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.23.attn_v.q_input3Arscale = util.global.load @"__auto.blk.23.attn_v.q_input:rscale" : tensor<f32>
%397 = torch_c.from_builtin_tensor %__auto.blk.23.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.23.attn_v.weight3Aqs = util.global.load @"__auto.blk.23.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%398 = torch_c.from_builtin_tensor %__auto.blk.23.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.23.kv_cache.quantizer3Arscale = util.global.load @"__auto.blk.23.kv_cache.quantizer:rscale" : tensor<f32>
%399 = torch_c.from_builtin_tensor %__auto.blk.23.kv_cache.quantizer3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.23.attn_output.q_input3Arscale = util.global.load @"__auto.blk.23.attn_output.q_input:rscale" : tensor<f32>
%400 = torch_c.from_builtin_tensor %__auto.blk.23.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.23.attn_output.weight3Aqs = util.global.load @"__auto.blk.23.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%401 = torch_c.from_builtin_tensor %__auto.blk.23.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.23.ffn_norm.weight = util.global.load @__auto.blk.23.ffn_norm.weight : tensor<4096xbf16>
%402 = torch_c.from_builtin_tensor %__auto.blk.23.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.blk.23.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.23.ffn_gate.q_input:rscale" : tensor<f32>
%403 = torch_c.from_builtin_tensor %__auto.blk.23.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.23.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.23.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%404 = torch_c.from_builtin_tensor %__auto.blk.23.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.23.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.23.ffn_up.q_input:rscale" : tensor<f32>
%405 = torch_c.from_builtin_tensor %__auto.blk.23.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.23.ffn_up.weight3Aqs = util.global.load @"__auto.blk.23.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%406 = torch_c.from_builtin_tensor %__auto.blk.23.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.23.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.23.ffn_down.q_input:rscale" : tensor<f32>
%407 = torch_c.from_builtin_tensor %__auto.blk.23.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.23.ffn_down.weight3Aqs = util.global.load @"__auto.blk.23.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ>
%408 = torch_c.from_builtin_tensor %__auto.blk.23.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
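    // blk.24 parameters.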
%__auto.blk.24.attn_norm.weight = util.global.load @__auto.blk.24.attn_norm.weight : tensor<4096xbf16>
%409 = torch_c.from_builtin_tensor %__auto.blk.24.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.blk.24.attn_q.q_input3Arscale = util.global.load @"__auto.blk.24.attn_q.q_input:rscale" : tensor<f32>
%410 = torch_c.from_builtin_tensor %__auto.blk.24.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.24.attn_q.weight3Aqs = util.global.load @"__auto.blk.24.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%411 = torch_c.from_builtin_tensor %__auto.blk.24.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.24.attn_k.q_input3Arscale = util.global.load @"__auto.blk.24.attn_k.q_input:rscale" : tensor<f32>
%412 = torch_c.from_builtin_tensor %__auto.blk.24.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.24.attn_k.weight3Aqs = util.global.load @"__auto.blk.24.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%413 = torch_c.from_builtin_tensor %__auto.blk.24.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.24.attn_v.q_input3Arscale = util.global.load @"__auto.blk.24.attn_v.q_input:rscale" : tensor<f32>
%414 = torch_c.from_builtin_tensor %__auto.blk.24.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.24.attn_v.weight3Aqs = util.global.load @"__auto.blk.24.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%415 = torch_c.from_builtin_tensor %__auto.blk.24.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.24.kv_cache.quantizer3Arscale = util.global.load @"__auto.blk.24.kv_cache.quantizer:rscale" : tensor<f32>
%416 = torch_c.from_builtin_tensor %__auto.blk.24.kv_cache.quantizer3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.24.attn_output.q_input3Arscale = util.global.load @"__auto.blk.24.attn_output.q_input:rscale" : tensor<f32>
%417 = torch_c.from_builtin_tensor %__auto.blk.24.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.24.attn_output.weight3Aqs = util.global.load @"__auto.blk.24.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%418 = torch_c.from_builtin_tensor %__auto.blk.24.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.24.ffn_norm.weight = util.global.load @__auto.blk.24.ffn_norm.weight : tensor<4096xbf16>
%419 = torch_c.from_builtin_tensor %__auto.blk.24.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.blk.24.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.24.ffn_gate.q_input:rscale" : tensor<f32>
%420 = torch_c.from_builtin_tensor %__auto.blk.24.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.24.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.24.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%421 = torch_c.from_builtin_tensor %__auto.blk.24.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.24.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.24.ffn_up.q_input:rscale" : tensor<f32>
%422 = torch_c.from_builtin_tensor %__auto.blk.24.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.24.ffn_up.weight3Aqs = util.global.load @"__auto.blk.24.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%423 = torch_c.from_builtin_tensor %__auto.blk.24.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.24.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.24.ffn_down.q_input:rscale" : tensor<f32>
%424 = torch_c.from_builtin_tensor %__auto.blk.24.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.24.ffn_down.weight3Aqs = util.global.load @"__auto.blk.24.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ>
%425 = torch_c.from_builtin_tensor %__auto.blk.24.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
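    // blk.25 parameters.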
%__auto.blk.25.attn_norm.weight = util.global.load @__auto.blk.25.attn_norm.weight : tensor<4096xbf16>
%426 = torch_c.from_builtin_tensor %__auto.blk.25.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.blk.25.attn_q.q_input3Arscale = util.global.load @"__auto.blk.25.attn_q.q_input:rscale" : tensor<f32>
%427 = torch_c.from_builtin_tensor %__auto.blk.25.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.25.attn_q.weight3Aqs = util.global.load @"__auto.blk.25.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%428 = torch_c.from_builtin_tensor %__auto.blk.25.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.25.attn_k.q_input3Arscale = util.global.load @"__auto.blk.25.attn_k.q_input:rscale" : tensor<f32>
%429 = torch_c.from_builtin_tensor %__auto.blk.25.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.25.attn_k.weight3Aqs = util.global.load @"__auto.blk.25.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%430 = torch_c.from_builtin_tensor %__auto.blk.25.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.25.attn_v.q_input3Arscale = util.global.load @"__auto.blk.25.attn_v.q_input:rscale" : tensor<f32>
%431 = torch_c.from_builtin_tensor %__auto.blk.25.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.25.attn_v.weight3Aqs = util.global.load @"__auto.blk.25.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%432 = torch_c.from_builtin_tensor %__auto.blk.25.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.25.kv_cache.quantizer3Arscale = util.global.load @"__auto.blk.25.kv_cache.quantizer:rscale" : tensor<f32>
%433 = torch_c.from_builtin_tensor %__auto.blk.25.kv_cache.quantizer3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.25.attn_output.q_input3Arscale = util.global.load @"__auto.blk.25.attn_output.q_input:rscale" : tensor<f32>
%434 = torch_c.from_builtin_tensor %__auto.blk.25.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.25.attn_output.weight3Aqs = util.global.load @"__auto.blk.25.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%435 = torch_c.from_builtin_tensor %__auto.blk.25.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.25.ffn_norm.weight = util.global.load @__auto.blk.25.ffn_norm.weight : tensor<4096xbf16>
%436 = torch_c.from_builtin_tensor %__auto.blk.25.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.blk.25.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.25.ffn_gate.q_input:rscale" : tensor<f32>
%437 = torch_c.from_builtin_tensor %__auto.blk.25.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.25.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.25.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%438 = torch_c.from_builtin_tensor %__auto.blk.25.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.25.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.25.ffn_up.q_input:rscale" : tensor<f32>
%439 = torch_c.from_builtin_tensor %__auto.blk.25.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.25.ffn_up.weight3Aqs = util.global.load @"__auto.blk.25.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%440 = torch_c.from_builtin_tensor %__auto.blk.25.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.25.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.25.ffn_down.q_input:rscale" : tensor<f32>
%441 = torch_c.from_builtin_tensor %__auto.blk.25.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.25.ffn_down.weight3Aqs = util.global.load @"__auto.blk.25.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ>
%442 = torch_c.from_builtin_tensor %__auto.blk.25.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
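    // blk.26 parameters.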
%__auto.blk.26.attn_norm.weight = util.global.load @__auto.blk.26.attn_norm.weight : tensor<4096xbf16>
%443 = torch_c.from_builtin_tensor %__auto.blk.26.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.blk.26.attn_q.q_input3Arscale = util.global.load @"__auto.blk.26.attn_q.q_input:rscale" : tensor<f32>
%444 = torch_c.from_builtin_tensor %__auto.blk.26.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.26.attn_q.weight3Aqs = util.global.load @"__auto.blk.26.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%445 = torch_c.from_builtin_tensor %__auto.blk.26.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.26.attn_k.q_input3Arscale = util.global.load @"__auto.blk.26.attn_k.q_input:rscale" : tensor<f32>
%446 = torch_c.from_builtin_tensor %__auto.blk.26.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.26.attn_k.weight3Aqs = util.global.load @"__auto.blk.26.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%447 = torch_c.from_builtin_tensor %__auto.blk.26.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.26.attn_v.q_input3Arscale = util.global.load @"__auto.blk.26.attn_v.q_input:rscale" : tensor<f32>
%448 = torch_c.from_builtin_tensor %__auto.blk.26.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.26.attn_v.weight3Aqs = util.global.load @"__auto.blk.26.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%449 = torch_c.from_builtin_tensor %__auto.blk.26.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.26.kv_cache.quantizer3Arscale = util.global.load @"__auto.blk.26.kv_cache.quantizer:rscale" : tensor<f32>
%450 = torch_c.from_builtin_tensor %__auto.blk.26.kv_cache.quantizer3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.26.attn_output.q_input3Arscale = util.global.load @"__auto.blk.26.attn_output.q_input:rscale" : tensor<f32>
%451 = torch_c.from_builtin_tensor %__auto.blk.26.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.26.attn_output.weight3Aqs = util.global.load @"__auto.blk.26.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%452 = torch_c.from_builtin_tensor %__auto.blk.26.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.26.ffn_norm.weight = util.global.load @__auto.blk.26.ffn_norm.weight : tensor<4096xbf16>
%453 = torch_c.from_builtin_tensor %__auto.blk.26.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.blk.26.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.26.ffn_gate.q_input:rscale" : tensor<f32>
%454 = torch_c.from_builtin_tensor %__auto.blk.26.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.26.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.26.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%455 = torch_c.from_builtin_tensor %__auto.blk.26.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.26.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.26.ffn_up.q_input:rscale" : tensor<f32>
%456 = torch_c.from_builtin_tensor %__auto.blk.26.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.26.ffn_up.weight3Aqs = util.global.load @"__auto.blk.26.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%457 = torch_c.from_builtin_tensor %__auto.blk.26.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.26.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.26.ffn_down.q_input:rscale" : tensor<f32>
%458 = torch_c.from_builtin_tensor %__auto.blk.26.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.26.ffn_down.weight3Aqs = util.global.load @"__auto.blk.26.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ>
%459 = torch_c.from_builtin_tensor %__auto.blk.26.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
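    // blk.27 parameters.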
%__auto.blk.27.attn_norm.weight = util.global.load @__auto.blk.27.attn_norm.weight : tensor<4096xbf16>
%460 = torch_c.from_builtin_tensor %__auto.blk.27.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.blk.27.attn_q.q_input3Arscale = util.global.load @"__auto.blk.27.attn_q.q_input:rscale" : tensor<f32>
%461 = torch_c.from_builtin_tensor %__auto.blk.27.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.27.attn_q.weight3Aqs = util.global.load @"__auto.blk.27.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%462 = torch_c.from_builtin_tensor %__auto.blk.27.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.27.attn_k.q_input3Arscale = util.global.load @"__auto.blk.27.attn_k.q_input:rscale" : tensor<f32>
%463 = torch_c.from_builtin_tensor %__auto.blk.27.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.27.attn_k.weight3Aqs = util.global.load @"__auto.blk.27.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%464 = torch_c.from_builtin_tensor %__auto.blk.27.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.27.attn_v.q_input3Arscale = util.global.load @"__auto.blk.27.attn_v.q_input:rscale" : tensor<f32>
%465 = torch_c.from_builtin_tensor %__auto.blk.27.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.27.attn_v.weight3Aqs = util.global.load @"__auto.blk.27.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%466 = torch_c.from_builtin_tensor %__auto.blk.27.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.27.kv_cache.quantizer3Arscale = util.global.load @"__auto.blk.27.kv_cache.quantizer:rscale" : tensor<f32>
%467 = torch_c.from_builtin_tensor %__auto.blk.27.kv_cache.quantizer3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.27.attn_output.q_input3Arscale = util.global.load @"__auto.blk.27.attn_output.q_input:rscale" : tensor<f32>
%468 = torch_c.from_builtin_tensor %__auto.blk.27.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.27.attn_output.weight3Aqs = util.global.load @"__auto.blk.27.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%469 = torch_c.from_builtin_tensor %__auto.blk.27.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.27.ffn_norm.weight = util.global.load @__auto.blk.27.ffn_norm.weight : tensor<4096xbf16>
%470 = torch_c.from_builtin_tensor %__auto.blk.27.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.blk.27.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.27.ffn_gate.q_input:rscale" : tensor<f32>
%471 = torch_c.from_builtin_tensor %__auto.blk.27.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.27.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.27.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%472 = torch_c.from_builtin_tensor %__auto.blk.27.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.27.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.27.ffn_up.q_input:rscale" : tensor<f32>
%473 = torch_c.from_builtin_tensor %__auto.blk.27.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.27.ffn_up.weight3Aqs = util.global.load @"__auto.blk.27.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%474 = torch_c.from_builtin_tensor %__auto.blk.27.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.27.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.27.ffn_down.q_input:rscale" : tensor<f32>
%475 = torch_c.from_builtin_tensor %__auto.blk.27.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.27.ffn_down.weight3Aqs = util.global.load @"__auto.blk.27.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ>
%476 = torch_c.from_builtin_tensor %__auto.blk.27.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
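    // blk.28 parameters.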
%__auto.blk.28.attn_norm.weight = util.global.load @__auto.blk.28.attn_norm.weight : tensor<4096xbf16>
%477 = torch_c.from_builtin_tensor %__auto.blk.28.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.blk.28.attn_q.q_input3Arscale = util.global.load @"__auto.blk.28.attn_q.q_input:rscale" : tensor<f32>
%478 = torch_c.from_builtin_tensor %__auto.blk.28.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.28.attn_q.weight3Aqs = util.global.load @"__auto.blk.28.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%479 = torch_c.from_builtin_tensor %__auto.blk.28.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.28.attn_k.q_input3Arscale = util.global.load @"__auto.blk.28.attn_k.q_input:rscale" : tensor<f32>
%480 = torch_c.from_builtin_tensor %__auto.blk.28.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.28.attn_k.weight3Aqs = util.global.load @"__auto.blk.28.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%481 = torch_c.from_builtin_tensor %__auto.blk.28.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.28.attn_v.q_input3Arscale = util.global.load @"__auto.blk.28.attn_v.q_input:rscale" : tensor<f32>
%482 = torch_c.from_builtin_tensor %__auto.blk.28.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.28.attn_v.weight3Aqs = util.global.load @"__auto.blk.28.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%483 = torch_c.from_builtin_tensor %__auto.blk.28.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.28.kv_cache.quantizer3Arscale = util.global.load @"__auto.blk.28.kv_cache.quantizer:rscale" : tensor<f32>
%484 = torch_c.from_builtin_tensor %__auto.blk.28.kv_cache.quantizer3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.28.attn_output.q_input3Arscale = util.global.load @"__auto.blk.28.attn_output.q_input:rscale" : tensor<f32>
%485 = torch_c.from_builtin_tensor %__auto.blk.28.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.28.attn_output.weight3Aqs = util.global.load @"__auto.blk.28.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%486 = torch_c.from_builtin_tensor %__auto.blk.28.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.28.ffn_norm.weight = util.global.load @__auto.blk.28.ffn_norm.weight : tensor<4096xbf16>
%487 = torch_c.from_builtin_tensor %__auto.blk.28.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.blk.28.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.28.ffn_gate.q_input:rscale" : tensor<f32>
%488 = torch_c.from_builtin_tensor %__auto.blk.28.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.28.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.28.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%489 = torch_c.from_builtin_tensor %__auto.blk.28.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.28.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.28.ffn_up.q_input:rscale" : tensor<f32>
%490 = torch_c.from_builtin_tensor %__auto.blk.28.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.28.ffn_up.weight3Aqs = util.global.load @"__auto.blk.28.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%491 = torch_c.from_builtin_tensor %__auto.blk.28.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.28.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.28.ffn_down.q_input:rscale" : tensor<f32>
%492 = torch_c.from_builtin_tensor %__auto.blk.28.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.28.ffn_down.weight3Aqs = util.global.load @"__auto.blk.28.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ>
%493 = torch_c.from_builtin_tensor %__auto.blk.28.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
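    // blk.29 parameters.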
%__auto.blk.29.attn_norm.weight = util.global.load @__auto.blk.29.attn_norm.weight : tensor<4096xbf16>
%494 = torch_c.from_builtin_tensor %__auto.blk.29.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.blk.29.attn_q.q_input3Arscale = util.global.load @"__auto.blk.29.attn_q.q_input:rscale" : tensor<f32>
%495 = torch_c.from_builtin_tensor %__auto.blk.29.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.29.attn_q.weight3Aqs = util.global.load @"__auto.blk.29.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%496 = torch_c.from_builtin_tensor %__auto.blk.29.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.29.attn_k.q_input3Arscale = util.global.load @"__auto.blk.29.attn_k.q_input:rscale" : tensor<f32>
%497 = torch_c.from_builtin_tensor %__auto.blk.29.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.29.attn_k.weight3Aqs = util.global.load @"__auto.blk.29.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%498 = torch_c.from_builtin_tensor %__auto.blk.29.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.29.attn_v.q_input3Arscale = util.global.load @"__auto.blk.29.attn_v.q_input:rscale" : tensor<f32>
%499 = torch_c.from_builtin_tensor %__auto.blk.29.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.29.attn_v.weight3Aqs = util.global.load @"__auto.blk.29.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%500 = torch_c.from_builtin_tensor %__auto.blk.29.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.29.kv_cache.quantizer3Arscale = util.global.load @"__auto.blk.29.kv_cache.quantizer:rscale" : tensor<f32>
%501 = torch_c.from_builtin_tensor %__auto.blk.29.kv_cache.quantizer3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.29.attn_output.q_input3Arscale = util.global.load @"__auto.blk.29.attn_output.q_input:rscale" : tensor<f32>
%502 = torch_c.from_builtin_tensor %__auto.blk.29.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.29.attn_output.weight3Aqs = util.global.load @"__auto.blk.29.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%503 = torch_c.from_builtin_tensor %__auto.blk.29.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.29.ffn_norm.weight = util.global.load @__auto.blk.29.ffn_norm.weight : tensor<4096xbf16>
%504 = torch_c.from_builtin_tensor %__auto.blk.29.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.blk.29.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.29.ffn_gate.q_input:rscale" : tensor<f32>
%505 = torch_c.from_builtin_tensor %__auto.blk.29.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.29.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.29.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%506 = torch_c.from_builtin_tensor %__auto.blk.29.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.29.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.29.ffn_up.q_input:rscale" : tensor<f32>
%507 = torch_c.from_builtin_tensor %__auto.blk.29.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.29.ffn_up.weight3Aqs = util.global.load @"__auto.blk.29.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%508 = torch_c.from_builtin_tensor %__auto.blk.29.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.29.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.29.ffn_down.q_input:rscale" : tensor<f32>
%509 = torch_c.from_builtin_tensor %__auto.blk.29.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.29.ffn_down.weight3Aqs = util.global.load @"__auto.blk.29.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ>
%510 = torch_c.from_builtin_tensor %__auto.blk.29.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
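    // blk.30 parameters.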
%__auto.blk.30.attn_norm.weight = util.global.load @__auto.blk.30.attn_norm.weight : tensor<4096xbf16>
%511 = torch_c.from_builtin_tensor %__auto.blk.30.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.blk.30.attn_q.q_input3Arscale = util.global.load @"__auto.blk.30.attn_q.q_input:rscale" : tensor<f32>
%512 = torch_c.from_builtin_tensor %__auto.blk.30.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.30.attn_q.weight3Aqs = util.global.load @"__auto.blk.30.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%513 = torch_c.from_builtin_tensor %__auto.blk.30.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.30.attn_k.q_input3Arscale = util.global.load @"__auto.blk.30.attn_k.q_input:rscale" : tensor<f32>
%514 = torch_c.from_builtin_tensor %__auto.blk.30.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.30.attn_k.weight3Aqs = util.global.load @"__auto.blk.30.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%515 = torch_c.from_builtin_tensor %__auto.blk.30.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.30.attn_v.q_input3Arscale = util.global.load @"__auto.blk.30.attn_v.q_input:rscale" : tensor<f32>
%516 = torch_c.from_builtin_tensor %__auto.blk.30.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.30.attn_v.weight3Aqs = util.global.load @"__auto.blk.30.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%517 = torch_c.from_builtin_tensor %__auto.blk.30.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.30.kv_cache.quantizer3Arscale = util.global.load @"__auto.blk.30.kv_cache.quantizer:rscale" : tensor<f32>
%518 = torch_c.from_builtin_tensor %__auto.blk.30.kv_cache.quantizer3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.30.attn_output.q_input3Arscale = util.global.load @"__auto.blk.30.attn_output.q_input:rscale" : tensor<f32>
%519 = torch_c.from_builtin_tensor %__auto.blk.30.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.30.attn_output.weight3Aqs = util.global.load @"__auto.blk.30.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%520 = torch_c.from_builtin_tensor %__auto.blk.30.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.30.ffn_norm.weight = util.global.load @__auto.blk.30.ffn_norm.weight : tensor<4096xbf16>
%521 = torch_c.from_builtin_tensor %__auto.blk.30.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.blk.30.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.30.ffn_gate.q_input:rscale" : tensor<f32>
%522 = torch_c.from_builtin_tensor %__auto.blk.30.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.30.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.30.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%523 = torch_c.from_builtin_tensor %__auto.blk.30.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.30.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.30.ffn_up.q_input:rscale" : tensor<f32>
%524 = torch_c.from_builtin_tensor %__auto.blk.30.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.30.ffn_up.weight3Aqs = util.global.load @"__auto.blk.30.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%525 = torch_c.from_builtin_tensor %__auto.blk.30.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.30.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.30.ffn_down.q_input:rscale" : tensor<f32>
%526 = torch_c.from_builtin_tensor %__auto.blk.30.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.30.ffn_down.weight3Aqs = util.global.load @"__auto.blk.30.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ>
%527 = torch_c.from_builtin_tensor %__auto.blk.30.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
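    // blk.31 parameters (final transformer block).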
%__auto.blk.31.attn_norm.weight = util.global.load @__auto.blk.31.attn_norm.weight : tensor<4096xbf16>
%528 = torch_c.from_builtin_tensor %__auto.blk.31.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.blk.31.attn_q.q_input3Arscale = util.global.load @"__auto.blk.31.attn_q.q_input:rscale" : tensor<f32>
%529 = torch_c.from_builtin_tensor %__auto.blk.31.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.31.attn_q.weight3Aqs = util.global.load @"__auto.blk.31.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%530 = torch_c.from_builtin_tensor %__auto.blk.31.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.31.attn_k.q_input3Arscale = util.global.load @"__auto.blk.31.attn_k.q_input:rscale" : tensor<f32>
%531 = torch_c.from_builtin_tensor %__auto.blk.31.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.31.attn_k.weight3Aqs = util.global.load @"__auto.blk.31.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%532 = torch_c.from_builtin_tensor %__auto.blk.31.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.31.attn_v.q_input3Arscale = util.global.load @"__auto.blk.31.attn_v.q_input:rscale" : tensor<f32>
%533 = torch_c.from_builtin_tensor %__auto.blk.31.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.31.attn_v.weight3Aqs = util.global.load @"__auto.blk.31.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%534 = torch_c.from_builtin_tensor %__auto.blk.31.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.31.kv_cache.quantizer3Arscale = util.global.load @"__auto.blk.31.kv_cache.quantizer:rscale" : tensor<f32>
%535 = torch_c.from_builtin_tensor %__auto.blk.31.kv_cache.quantizer3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.31.attn_output.q_input3Arscale = util.global.load @"__auto.blk.31.attn_output.q_input:rscale" : tensor<f32>
%536 = torch_c.from_builtin_tensor %__auto.blk.31.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.31.attn_output.weight3Aqs = util.global.load @"__auto.blk.31.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%537 = torch_c.from_builtin_tensor %__auto.blk.31.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.31.ffn_norm.weight = util.global.load @__auto.blk.31.ffn_norm.weight : tensor<4096xbf16>
%538 = torch_c.from_builtin_tensor %__auto.blk.31.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.blk.31.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.31.ffn_gate.q_input:rscale" : tensor<f32>
%539 = torch_c.from_builtin_tensor %__auto.blk.31.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.31.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.31.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%540 = torch_c.from_builtin_tensor %__auto.blk.31.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.31.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.31.ffn_up.q_input:rscale" : tensor<f32>
%541 = torch_c.from_builtin_tensor %__auto.blk.31.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.31.ffn_up.weight3Aqs = util.global.load @"__auto.blk.31.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%542 = torch_c.from_builtin_tensor %__auto.blk.31.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.31.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.31.ffn_down.q_input:rscale" : tensor<f32>
%543 = torch_c.from_builtin_tensor %__auto.blk.31.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.31.ffn_down.weight3Aqs = util.global.load @"__auto.blk.31.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ>
%544 = torch_c.from_builtin_tensor %__auto.blk.31.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
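    // final output_norm.weight and output.weight (128256x4096 bf16 LM head).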
%__auto.output_norm.weight = util.global.load @__auto.output_norm.weight : tensor<4096xbf16>
%545 = torch_c.from_builtin_tensor %__auto.output_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.output.weight = util.global.load @__auto.output.weight : tensor<128256x4096xbf16>
%546 = torch_c.from_builtin_tensor %__auto.output.weight : tensor<128256x4096xbf16> -> !torch.vtensor<[128256,4096],bf16>
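    // end of parameter loads. %arg3 is copied to a vtensor (likely the paged KV cache, [s2, 2097152] f16); symbolic dims s1/s2 bind the dynamic shapes of %arg0 (token ids, [1, s1*32]), %arg2 ([1, s1]), and the cache.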
%547 = torch.copy.to_vtensor %arg3 : !torch.vtensor<[?,2097152],f16>
%548 = torch.symbolic_int "s1" {min_val = 2, max_val = 4095} : !torch.int
%549 = torch.symbolic_int "s2" {min_val = 2, max_val = 9223372036854775806} : !torch.int
torch.bind_symbolic_shape %arg0, [%548], affine_map<()[s0] -> (1, s0 * 32)> : !torch.vtensor<[1,?],si64>
torch.bind_symbolic_shape %arg2, [%548], affine_map<()[s0] -> (1, s0)> : !torch.vtensor<[1,?],si64>
torch.bind_symbolic_shape %547, [%549], affine_map<()[s0] -> (s0, 2097152)> : !torch.vtensor<[?,2097152],f16>
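    // token embedding: cast token_embd.weight (%0) from bf16 to f16, then aten.embedding gathers rows for %arg0 -> [1, s1*32, 4096] f16.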
%int5 = torch.constant.int 5
%550 = torch.prims.convert_element_type %0, %int5 : !torch.vtensor<[128256,4096],bf16>, !torch.int -> !torch.vtensor<[128256,4096],f16>
%int-1 = torch.constant.int -1
%false = torch.constant.bool false
%false_0 = torch.constant.bool false
%551 = torch.aten.embedding %550, %arg0, %int-1, %false, %false_0 : !torch.vtensor<[128256,4096],f16>, !torch.vtensor<[1,?],si64>, !torch.int, !torch.bool, !torch.bool -> !torch.vtensor<[1,?,4096],f16>
torch.bind_symbolic_shape %551, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f16>
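    // blk.0 attention RMSNorm: mean(x^2) over the last dim (keepdim), add eps 1e-5, rsqrt, scale, then multiply by attn_norm.weight (%1).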
%int6 = torch.constant.int 6
%552 = torch.prims.convert_element_type %551, %int6 : !torch.vtensor<[1,?,4096],f16>, !torch.int -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %552, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int2 = torch.constant.int 2
%553 = torch.aten.pow.Tensor_Scalar %552, %int2 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %553, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int-1_1 = torch.constant.int -1
%554 = torch.prim.ListConstruct %int-1_1 : (!torch.int) -> !torch.list<int>
%true = torch.constant.bool true
%none = torch.constant.none
%555 = torch.aten.mean.dim %553, %554, %true, %none : !torch.vtensor<[1,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,?,1],f32>
torch.bind_symbolic_shape %555, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32>
%float1.000000e-05 = torch.constant.float 1.000000e-05
%int1 = torch.constant.int 1
%556 = torch.aten.add.Scalar %555, %float1.000000e-05, %int1 : !torch.vtensor<[1,?,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,?,1],f32>
torch.bind_symbolic_shape %556, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32>
%557 = torch.aten.rsqrt %556 : !torch.vtensor<[1,?,1],f32> -> !torch.vtensor<[1,?,1],f32>
torch.bind_symbolic_shape %557, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32>
%558 = torch.aten.mul.Tensor %552, %557 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[1,?,1],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %558, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int5_2 = torch.constant.int 5
%559 = torch.prims.convert_element_type %558, %int5_2 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f16>
torch.bind_symbolic_shape %559, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f16>
%560 = torch.aten.mul.Tensor %1, %559 : !torch.vtensor<[4096],bf16>, !torch.vtensor<[1,?,4096],f16> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %560, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int5_3 = torch.constant.int 5
%561 = torch.prims.convert_element_type %560, %int5_3 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f16>
torch.bind_symbolic_shape %561, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f16>
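    // blk.0 attn_q projection: quantize the normed input (divide by q_input:rscale %2, clamp to +/-240 = f8E4M3FNUZ max, cast to f8), f8 matmul with the transposed attn_q.weight:qs (%3), reshape back to [1, seq, 4096], cast result to bf16.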
%562 = torch.aten.div.Tensor %561, %2 : !torch.vtensor<[1,?,4096],f16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],f16>
torch.bind_symbolic_shape %562, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f16>
%float-2.400000e02 = torch.constant.float -2.400000e+02
%float2.400000e02 = torch.constant.float 2.400000e+02
%563 = torch.aten.clamp %562, %float-2.400000e02, %float2.400000e02 : !torch.vtensor<[1,?,4096],f16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],f16>
torch.bind_symbolic_shape %563, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f16>
%int26 = torch.constant.int 26
%564 = torch.prims.convert_element_type %563, %int26 : !torch.vtensor<[1,?,4096],f16>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %564, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int-2 = torch.constant.int -2
%int-1_4 = torch.constant.int -1
%565 = torch.aten.transpose.int %3, %int-2, %int-1_4 : !torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%int1_5 = torch.constant.int 1
%566 = torch.aten.size.int %arg0, %int1_5 : !torch.vtensor<[1,?],si64>, !torch.int -> !torch.int
%int4096 = torch.constant.int 4096
%567 = torch.prim.ListConstruct %566, %int4096 : (!torch.int, !torch.int) -> !torch.list<int>
%568 = torch.aten.view %564, %567 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %568, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%569 = torch.aten.mm %568, %565 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,4096],f8E4M3FNUZ> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %569, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%int1_6 = torch.constant.int 1
%int4096_7 = torch.constant.int 4096
%570 = torch.prim.ListConstruct %int1_6, %566, %int4096_7 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%571 = torch.aten.view %569, %570 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %571, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int15 = torch.constant.int 15
%572 = torch.prims.convert_element_type %571, %int15 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,4096],bf16>
torch.bind_symbolic_shape %572, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16>
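    // blk.0 attn_k projection: same quantize / f8 matmul / bf16 cast pattern with attn_k scale %4 and weight %5, 4096 -> 1024.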
%573 = torch.aten.div.Tensor %561, %4 : !torch.vtensor<[1,?,4096],f16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],f16>
torch.bind_symbolic_shape %573, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f16>
%float-2.400000e02_8 = torch.constant.float -2.400000e+02
%float2.400000e02_9 = torch.constant.float 2.400000e+02
%574 = torch.aten.clamp %573, %float-2.400000e02_8, %float2.400000e02_9 : !torch.vtensor<[1,?,4096],f16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],f16>
torch.bind_symbolic_shape %574, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f16>
%int26_10 = torch.constant.int 26
%575 = torch.prims.convert_element_type %574, %int26_10 : !torch.vtensor<[1,?,4096],f16>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %575, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int-2_11 = torch.constant.int -2
%int-1_12 = torch.constant.int -1
%576 = torch.aten.transpose.int %5, %int-2_11, %int-1_12 : !torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,1024],f8E4M3FNUZ>
%int4096_13 = torch.constant.int 4096
%577 = torch.prim.ListConstruct %566, %int4096_13 : (!torch.int, !torch.int) -> !torch.list<int>
%578 = torch.aten.view %575, %577 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %578, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%579 = torch.aten.mm %578, %576 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,1024],f8E4M3FNUZ> -> !torch.vtensor<[?,1024],f8E4M3FNUZ>
torch.bind_symbolic_shape %579, [%548], affine_map<()[s0] -> (s0 * 32, 1024)> : !torch.vtensor<[?,1024],f8E4M3FNUZ>
%int1_14 = torch.constant.int 1
%int1024 = torch.constant.int 1024
%580 = torch.prim.ListConstruct %int1_14, %566, %int1024 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%581 = torch.aten.view %579, %580 : !torch.vtensor<[?,1024],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,1024],f8E4M3FNUZ>
torch.bind_symbolic_shape %581, [%548], affine_map<()[s0] -> (1, s0 * 32, 1024)> : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>
%int15_15 = torch.constant.int 15
%582 = torch.prims.convert_element_type %581, %int15_15 : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,1024],bf16>
torch.bind_symbolic_shape %582, [%548], affine_map<()[s0] -> (1, s0 * 32, 1024)> : !torch.vtensor<[1,?,1024],bf16>
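    // blk.0 attn_v projection: same pattern with attn_v scale %6 and weight %7, 4096 -> 1024.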
%583 = torch.aten.div.Tensor %561, %6 : !torch.vtensor<[1,?,4096],f16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],f16>
torch.bind_symbolic_shape %583, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f16>
%float-2.400000e02_16 = torch.constant.float -2.400000e+02
%float2.400000e02_17 = torch.constant.float 2.400000e+02
%584 = torch.aten.clamp %583, %float-2.400000e02_16, %float2.400000e02_17 : !torch.vtensor<[1,?,4096],f16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],f16>
torch.bind_symbolic_shape %584, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f16>
%int26_18 = torch.constant.int 26
%585 = torch.prims.convert_element_type %584, %int26_18 : !torch.vtensor<[1,?,4096],f16>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %585, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int-2_19 = torch.constant.int -2
%int-1_20 = torch.constant.int -1
%586 = torch.aten.transpose.int %7, %int-2_19, %int-1_20 : !torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,1024],f8E4M3FNUZ>
%int4096_21 = torch.constant.int 4096
%587 = torch.prim.ListConstruct %566, %int4096_21 : (!torch.int, !torch.int) -> !torch.list<int>
%588 = torch.aten.view %585, %587 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %588, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%589 = torch.aten.mm %588, %586 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,1024],f8E4M3FNUZ> -> !torch.vtensor<[?,1024],f8E4M3FNUZ>
torch.bind_symbolic_shape %589, [%548], affine_map<()[s0] -> (s0 * 32, 1024)> : !torch.vtensor<[?,1024],f8E4M3FNUZ>
%int1_22 = torch.constant.int 1
%int1024_23 = torch.constant.int 1024
%590 = torch.prim.ListConstruct %int1_22, %566, %int1024_23 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%591 = torch.aten.view %589, %590 : !torch.vtensor<[?,1024],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,1024],f8E4M3FNUZ>
torch.bind_symbolic_shape %591, [%548], affine_map<()[s0] -> (1, s0 * 32, 1024)> : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>
%int15_24 = torch.constant.int 15
%592 = torch.prims.convert_element_type %591, %int15_24 : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,1024],bf16>
torch.bind_symbolic_shape %592, [%548], affine_map<()[s0] -> (1, s0 * 32, 1024)> : !torch.vtensor<[1,?,1024],bf16>
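    // split into heads: Q -> [1, seq, 32, 128]; K and V -> [1, seq, 8, 128] (8 KV heads, i.e. grouped-query attention).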
%int1_25 = torch.constant.int 1
%int32 = torch.constant.int 32
%int128 = torch.constant.int 128
%593 = torch.prim.ListConstruct %int1_25, %566, %int32, %int128 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%594 = torch.aten.view %572, %593 : !torch.vtensor<[1,?,4096],bf16>, !torch.list<int> -> !torch.vtensor<[1,?,32,128],bf16>
torch.bind_symbolic_shape %594, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16>
%int1_26 = torch.constant.int 1
%int8 = torch.constant.int 8
%int128_27 = torch.constant.int 128
%595 = torch.prim.ListConstruct %int1_26, %566, %int8, %int128_27 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%596 = torch.aten.view %582, %595 : !torch.vtensor<[1,?,1024],bf16>, !torch.list<int> -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %596, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
%int1_28 = torch.constant.int 1
%int8_29 = torch.constant.int 8
%int128_30 = torch.constant.int 128
%597 = torch.prim.ListConstruct %int1_28, %566, %int8_29, %int128_30 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%598 = torch.aten.view %592, %597 : !torch.vtensor<[1,?,1024],bf16>, !torch.list<int> -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %598, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
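    // RoPE frequency table: positions 0..131071, inv_freq[i] = 500000^(-2*floor(i/2)/128), outer product -> [131072, 128] f32.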
%int131072 = torch.constant.int 131072
%none_31 = torch.constant.none
%none_32 = torch.constant.none
%cpu = torch.constant.device "cpu"
%false_33 = torch.constant.bool false
%599 = torch.aten.arange %int131072, %none_31, %none_32, %cpu, %false_33 : !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[131072],si64>
%int0 = torch.constant.int 0
%int128_34 = torch.constant.int 128
%none_35 = torch.constant.none
%none_36 = torch.constant.none
%cpu_37 = torch.constant.device "cpu"
%false_38 = torch.constant.bool false
%600 = torch.aten.arange.start %int0, %int128_34, %none_35, %none_36, %cpu_37, %false_38 : !torch.int, !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[128],si64>
%int2_39 = torch.constant.int 2
%601 = torch.aten.floor_divide.Scalar %600, %int2_39 : !torch.vtensor<[128],si64>, !torch.int -> !torch.vtensor<[128],si64>
%int6_40 = torch.constant.int 6
%602 = torch.prims.convert_element_type %601, %int6_40 : !torch.vtensor<[128],si64>, !torch.int -> !torch.vtensor<[128],f32>
%int128_41 = torch.constant.int 128
%603 = torch.aten.div.Scalar %602, %int128_41 : !torch.vtensor<[128],f32>, !torch.int -> !torch.vtensor<[128],f32>
%float2.000000e00 = torch.constant.float 2.000000e+00
%604 = torch.aten.mul.Scalar %603, %float2.000000e00 : !torch.vtensor<[128],f32>, !torch.float -> !torch.vtensor<[128],f32>
%float5.000000e05 = torch.constant.float 5.000000e+05
%605 = torch.aten.pow.Scalar %float5.000000e05, %604 : !torch.float, !torch.vtensor<[128],f32> -> !torch.vtensor<[128],f32>
%606 = torch.aten.reciprocal %605 : !torch.vtensor<[128],f32> -> !torch.vtensor<[128],f32>
%float1.000000e00 = torch.constant.float 1.000000e+00
%607 = torch.aten.mul.Scalar %606, %float1.000000e00 : !torch.vtensor<[128],f32>, !torch.float -> !torch.vtensor<[128],f32>
%int131072_42 = torch.constant.int 131072
%int1_43 = torch.constant.int 1
%608 = torch.prim.ListConstruct %int131072_42, %int1_43 : (!torch.int, !torch.int) -> !torch.list<int>
%609 = torch.aten.view %599, %608 : !torch.vtensor<[131072],si64>, !torch.list<int> -> !torch.vtensor<[131072,1],si64>
%610 = torch.aten.mul.Tensor %609, %607 : !torch.vtensor<[131072,1],si64>, !torch.vtensor<[128],f32> -> !torch.vtensor<[131072,128],f32>
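    // slice the table to the current sequence length and apply rotary embedding to Q in f32 via the sharktank_rotary_embedding kernel, then cast back to bf16.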
%int1_44 = torch.constant.int 1
%611 = torch.aten.size.int %571, %int1_44 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.int -> !torch.int
%int0_45 = torch.constant.int 0
%612 = torch.aten.add.int %int0_45, %611 : !torch.int, !torch.int -> !torch.int
%int0_46 = torch.constant.int 0
%int0_47 = torch.constant.int 0
%int1_48 = torch.constant.int 1
%613 = torch.aten.slice.Tensor %610, %int0_46, %int0_47, %612, %int1_48 : !torch.vtensor<[131072,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],f32>
torch.bind_symbolic_shape %613, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],f32>
%int1_49 = torch.constant.int 1
%int0_50 = torch.constant.int 0
%int9223372036854775807 = torch.constant.int 9223372036854775807
%int1_51 = torch.constant.int 1
%614 = torch.aten.slice.Tensor %613, %int1_49, %int0_50, %int9223372036854775807, %int1_51 : !torch.vtensor<[?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],f32>
torch.bind_symbolic_shape %614, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],f32>
%int1_52 = torch.constant.int 1
%int0_53 = torch.constant.int 0
%int9223372036854775807_54 = torch.constant.int 9223372036854775807
%int1_55 = torch.constant.int 1
%615 = torch.aten.slice.Tensor %614, %int1_52, %int0_53, %int9223372036854775807_54, %int1_55 : !torch.vtensor<[?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],f32>
torch.bind_symbolic_shape %615, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],f32>
%int0_56 = torch.constant.int 0
%616 = torch.aten.unsqueeze %615, %int0_56 : !torch.vtensor<[?,128],f32>, !torch.int -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %616, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int1_57 = torch.constant.int 1
%int0_58 = torch.constant.int 0
%int9223372036854775807_59 = torch.constant.int 9223372036854775807
%int1_60 = torch.constant.int 1
%617 = torch.aten.slice.Tensor %616, %int1_57, %int0_58, %int9223372036854775807_59, %int1_60 : !torch.vtensor<[1,?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %617, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int2_61 = torch.constant.int 2
%int0_62 = torch.constant.int 0
%int9223372036854775807_63 = torch.constant.int 9223372036854775807
%int1_64 = torch.constant.int 1
%618 = torch.aten.slice.Tensor %617, %int2_61, %int0_62, %int9223372036854775807_63, %int1_64 : !torch.vtensor<[1,?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %618, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int1_65 = torch.constant.int 1
%int1_66 = torch.constant.int 1
%int1_67 = torch.constant.int 1
%619 = torch.prim.ListConstruct %int1_65, %int1_66, %int1_67 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%620 = torch.aten.repeat %618, %619 : !torch.vtensor<[1,?,128],f32>, !torch.list<int> -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %620, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
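// Apply rotary embedding to the 32-head query tensor: upcast bf16 -> f32,
// call the sharktank rotary kernel, and cast the result back to bf16.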
%int6_68 = torch.constant.int 6
%621 = torch.prims.convert_element_type %594, %int6_68 : !torch.vtensor<[1,?,32,128],bf16>, !torch.int -> !torch.vtensor<[1,?,32,128],f32>
torch.bind_symbolic_shape %621, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32>
%622 = torch_c.to_builtin_tensor %621 : !torch.vtensor<[1,?,32,128],f32> -> tensor<1x?x32x128xf32>
%623 = torch_c.to_builtin_tensor %620 : !torch.vtensor<[1,?,128],f32> -> tensor<1x?x128xf32>
%624 = util.call @sharktank_rotary_embedding_1_D_32_128_f32(%622, %623) : (tensor<1x?x32x128xf32>, tensor<1x?x128xf32>) -> tensor<1x?x32x128xf32>
%625 = torch_c.from_builtin_tensor %624 : tensor<1x?x32x128xf32> -> !torch.vtensor<[1,?,32,128],f32>
torch.bind_symbolic_shape %625, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32>
%int15_69 = torch.constant.int 15
%626 = torch.prims.convert_element_type %625, %int15_69 : !torch.vtensor<[1,?,32,128],f32>, !torch.int -> !torch.vtensor<[1,?,32,128],bf16>
torch.bind_symbolic_shape %626, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16>
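// The same angle-table construction is repeated below and the rotary embedding
// is applied to the 8-head key tensor (%596) via the 8-head kernel variant.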
%int131072_70 = torch.constant.int 131072
%none_71 = torch.constant.none
%none_72 = torch.constant.none
%cpu_73 = torch.constant.device "cpu"
%false_74 = torch.constant.bool false
%627 = torch.aten.arange %int131072_70, %none_71, %none_72, %cpu_73, %false_74 : !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[131072],si64>
%int0_75 = torch.constant.int 0
%int128_76 = torch.constant.int 128
%none_77 = torch.constant.none
%none_78 = torch.constant.none
%cpu_79 = torch.constant.device "cpu"
%false_80 = torch.constant.bool false
%628 = torch.aten.arange.start %int0_75, %int128_76, %none_77, %none_78, %cpu_79, %false_80 : !torch.int, !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[128],si64>
%int2_81 = torch.constant.int 2
%629 = torch.aten.floor_divide.Scalar %628, %int2_81 : !torch.vtensor<[128],si64>, !torch.int -> !torch.vtensor<[128],si64>
%int6_82 = torch.constant.int 6
%630 = torch.prims.convert_element_type %629, %int6_82 : !torch.vtensor<[128],si64>, !torch.int -> !torch.vtensor<[128],f32>
%int128_83 = torch.constant.int 128
%631 = torch.aten.div.Scalar %630, %int128_83 : !torch.vtensor<[128],f32>, !torch.int -> !torch.vtensor<[128],f32>
%float2.000000e00_84 = torch.constant.float 2.000000e+00
%632 = torch.aten.mul.Scalar %631, %float2.000000e00_84 : !torch.vtensor<[128],f32>, !torch.float -> !torch.vtensor<[128],f32>
%float5.000000e05_85 = torch.constant.float 5.000000e+05
%633 = torch.aten.pow.Scalar %float5.000000e05_85, %632 : !torch.float, !torch.vtensor<[128],f32> -> !torch.vtensor<[128],f32>
%634 = torch.aten.reciprocal %633 : !torch.vtensor<[128],f32> -> !torch.vtensor<[128],f32>
%float1.000000e00_86 = torch.constant.float 1.000000e+00
%635 = torch.aten.mul.Scalar %634, %float1.000000e00_86 : !torch.vtensor<[128],f32>, !torch.float -> !torch.vtensor<[128],f32>
%int131072_87 = torch.constant.int 131072
%int1_88 = torch.constant.int 1
%636 = torch.prim.ListConstruct %int131072_87, %int1_88 : (!torch.int, !torch.int) -> !torch.list<int>
%637 = torch.aten.view %627, %636 : !torch.vtensor<[131072],si64>, !torch.list<int> -> !torch.vtensor<[131072,1],si64>
%638 = torch.aten.mul.Tensor %637, %635 : !torch.vtensor<[131072,1],si64>, !torch.vtensor<[128],f32> -> !torch.vtensor<[131072,128],f32>
%int1_89 = torch.constant.int 1
%639 = torch.aten.size.int %581, %int1_89 : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>, !torch.int -> !torch.int
%int0_90 = torch.constant.int 0
%640 = torch.aten.add.int %int0_90, %639 : !torch.int, !torch.int -> !torch.int
%int0_91 = torch.constant.int 0
%int0_92 = torch.constant.int 0
%int1_93 = torch.constant.int 1
%641 = torch.aten.slice.Tensor %638, %int0_91, %int0_92, %640, %int1_93 : !torch.vtensor<[131072,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],f32>
torch.bind_symbolic_shape %641, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],f32>
%int1_94 = torch.constant.int 1
%int0_95 = torch.constant.int 0
%int9223372036854775807_96 = torch.constant.int 9223372036854775807
%int1_97 = torch.constant.int 1
%642 = torch.aten.slice.Tensor %641, %int1_94, %int0_95, %int9223372036854775807_96, %int1_97 : !torch.vtensor<[?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],f32>
torch.bind_symbolic_shape %642, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],f32>
%int1_98 = torch.constant.int 1
%int0_99 = torch.constant.int 0
%int9223372036854775807_100 = torch.constant.int 9223372036854775807
%int1_101 = torch.constant.int 1
%643 = torch.aten.slice.Tensor %642, %int1_98, %int0_99, %int9223372036854775807_100, %int1_101 : !torch.vtensor<[?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],f32>
torch.bind_symbolic_shape %643, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],f32>
%int0_102 = torch.constant.int 0
%644 = torch.aten.unsqueeze %643, %int0_102 : !torch.vtensor<[?,128],f32>, !torch.int -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %644, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int1_103 = torch.constant.int 1
%int0_104 = torch.constant.int 0
%int9223372036854775807_105 = torch.constant.int 9223372036854775807
%int1_106 = torch.constant.int 1
%645 = torch.aten.slice.Tensor %644, %int1_103, %int0_104, %int9223372036854775807_105, %int1_106 : !torch.vtensor<[1,?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %645, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int2_107 = torch.constant.int 2
%int0_108 = torch.constant.int 0
%int9223372036854775807_109 = torch.constant.int 9223372036854775807
%int1_110 = torch.constant.int 1
%646 = torch.aten.slice.Tensor %645, %int2_107, %int0_108, %int9223372036854775807_109, %int1_110 : !torch.vtensor<[1,?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %646, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int1_111 = torch.constant.int 1
%int1_112 = torch.constant.int 1
%int1_113 = torch.constant.int 1
%647 = torch.prim.ListConstruct %int1_111, %int1_112, %int1_113 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%648 = torch.aten.repeat %646, %647 : !torch.vtensor<[1,?,128],f32>, !torch.list<int> -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %648, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int6_114 = torch.constant.int 6
%649 = torch.prims.convert_element_type %596, %int6_114 : !torch.vtensor<[1,?,8,128],bf16>, !torch.int -> !torch.vtensor<[1,?,8,128],f32>
torch.bind_symbolic_shape %649, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],f32>
%650 = torch_c.to_builtin_tensor %649 : !torch.vtensor<[1,?,8,128],f32> -> tensor<1x?x8x128xf32>
%651 = torch_c.to_builtin_tensor %648 : !torch.vtensor<[1,?,128],f32> -> tensor<1x?x128xf32>
%652 = util.call @sharktank_rotary_embedding_1_D_8_128_f32(%650, %651) : (tensor<1x?x8x128xf32>, tensor<1x?x128xf32>) -> tensor<1x?x8x128xf32>
%653 = torch_c.from_builtin_tensor %652 : tensor<1x?x8x128xf32> -> !torch.vtensor<[1,?,8,128],f32>
torch.bind_symbolic_shape %653, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],f32>
%int15_115 = torch.constant.int 15
%654 = torch.prims.convert_element_type %653, %int15_115 : !torch.vtensor<[1,?,8,128],f32>, !torch.int -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %654, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
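// Quantize the rotated keys (%654) and the values (%598) for the KV cache:
// divide by the per-tensor scale %8, clamp to [-240, 240] (the representable
// f8E4M3FNUZ range), and convert to f8E4M3FNUZ.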
%655 = torch.aten.div.Tensor %654, %8 : !torch.vtensor<[1,?,8,128],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %655, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
%float-2.400000e02_116 = torch.constant.float -2.400000e+02
%float2.400000e02_117 = torch.constant.float 2.400000e+02
%656 = torch.aten.clamp %655, %float-2.400000e02_116, %float2.400000e02_117 : !torch.vtensor<[1,?,8,128],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %656, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
%int26_118 = torch.constant.int 26
%657 = torch.prims.convert_element_type %656, %int26_118 : !torch.vtensor<[1,?,8,128],bf16>, !torch.int -> !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %657, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>
%658 = torch.aten.div.Tensor %598, %8 : !torch.vtensor<[1,?,8,128],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %658, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
%float-2.400000e02_119 = torch.constant.float -2.400000e+02
%float2.400000e02_120 = torch.constant.float 2.400000e+02
%659 = torch.aten.clamp %658, %float-2.400000e02_119, %float2.400000e02_120 : !torch.vtensor<[1,?,8,128],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %659, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
%int26_121 = torch.constant.int 26
%660 = torch.prims.convert_element_type %659, %int26_121 : !torch.vtensor<[1,?,8,128],bf16>, !torch.int -> !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %660, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>
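// Scatter the quantized keys into the paged cache. %547 ([pages, 2097152] f16)
// is viewed as [pages, 32, 2, 32, 8, 128], consistent with 32 layers x {K, V}
// x 32 tokens per page x 8 kv-heads x 128, then flattened to
// [pages*64, 32, 8, 128]; key rows go to slot page_id * 64 + 0 (%669) via index_put.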
%int0_122 = torch.constant.int 0
%661 = torch.aten.size.int %547, %int0_122 : !torch.vtensor<[?,2097152],f16>, !torch.int -> !torch.int
%int32_123 = torch.constant.int 32
%int2_124 = torch.constant.int 2
%int32_125 = torch.constant.int 32
%int8_126 = torch.constant.int 8
%int128_127 = torch.constant.int 128
%662 = torch.prim.ListConstruct %661, %int32_123, %int2_124, %int32_125, %int8_126, %int128_127 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%663 = torch.aten.view %547, %662 : !torch.vtensor<[?,2097152],f16>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f16>
torch.bind_symbolic_shape %663, [%549], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f16>
%int32_128 = torch.constant.int 32
%664 = torch.aten.mul.int %661, %int32_128 : !torch.int, !torch.int -> !torch.int
%int2_129 = torch.constant.int 2
%665 = torch.aten.mul.int %664, %int2_129 : !torch.int, !torch.int -> !torch.int
%int32_130 = torch.constant.int 32
%int8_131 = torch.constant.int 8
%int128_132 = torch.constant.int 128
%666 = torch.prim.ListConstruct %665, %int32_130, %int8_131, %int128_132 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%667 = torch.aten.view %663, %666 : !torch.vtensor<[?,32,2,32,8,128],f16>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f16>
torch.bind_symbolic_shape %667, [%549], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f16>
%int64 = torch.constant.int 64
%668 = torch.aten.mul.Scalar %arg2, %int64 : !torch.vtensor<[1,?],si64>, !torch.int -> !torch.vtensor<[1,?],si64>
torch.bind_symbolic_shape %668, [%548], affine_map<()[s0] -> (1, s0)> : !torch.vtensor<[1,?],si64>
%int0_133 = torch.constant.int 0
%int1_134 = torch.constant.int 1
%669 = torch.aten.add.Scalar %668, %int0_133, %int1_134 : !torch.vtensor<[1,?],si64>, !torch.int, !torch.int -> !torch.vtensor<[1,?],si64>
torch.bind_symbolic_shape %669, [%548], affine_map<()[s0] -> (1, s0)> : !torch.vtensor<[1,?],si64>
%int1_135 = torch.constant.int 1
%670 = torch.aten.size.int %arg2, %int1_135 : !torch.vtensor<[1,?],si64>, !torch.int -> !torch.int
%int1_136 = torch.constant.int 1
%int32_137 = torch.constant.int 32
%int8_138 = torch.constant.int 8
%int128_139 = torch.constant.int 128
%671 = torch.prim.ListConstruct %int1_136, %670, %int32_137, %int8_138, %int128_139 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%672 = torch.aten.view %657, %671 : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %672, [%548], affine_map<()[s0] -> (1, s0, 32, 8, 128)> : !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>
%int32_140 = torch.constant.int 32
%int8_141 = torch.constant.int 8
%int128_142 = torch.constant.int 128
%673 = torch.prim.ListConstruct %670, %int32_140, %int8_141, %int128_142 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%674 = torch.aten.view %672, %673 : !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %674, [%548], affine_map<()[s0] -> (s0, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
%675 = torch.prim.ListConstruct %670 : (!torch.int) -> !torch.list<int>
%676 = torch.aten.view %669, %675 : !torch.vtensor<[1,?],si64>, !torch.list<int> -> !torch.vtensor<[?],si64>
torch.bind_symbolic_shape %676, [%548], affine_map<()[s0] -> (s0)> : !torch.vtensor<[?],si64>
%677 = torch.prim.ListConstruct %676 : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>>
%false_143 = torch.constant.bool false
%678 = torch.aten.index_put %667, %677, %674, %false_143 : !torch.vtensor<[?,32,8,128],f16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.bool -> !torch.vtensor<[?,32,8,128],f16>
torch.bind_symbolic_shape %678, [%549], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f16>
%int32_144 = torch.constant.int 32
%int2_145 = torch.constant.int 2
%int32_146 = torch.constant.int 32
%int8_147 = torch.constant.int 8
%int128_148 = torch.constant.int 128
%679 = torch.prim.ListConstruct %661, %int32_144, %int2_145, %int32_146, %int8_147, %int128_148 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%680 = torch.aten.view %678, %679 : !torch.vtensor<[?,32,8,128],f16>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f16>
torch.bind_symbolic_shape %680, [%549], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f16>
%int2097152 = torch.constant.int 2097152
%681 = torch.prim.ListConstruct %661, %int2097152 : (!torch.int, !torch.int) -> !torch.list<int>
%682 = torch.aten.view %680, %681 : !torch.vtensor<[?,32,2,32,8,128],f16>, !torch.list<int> -> !torch.vtensor<[?,2097152],f16>
torch.bind_symbolic_shape %682, [%549], affine_map<()[s0] -> (s0, 2097152)> : !torch.vtensor<[?,2097152],f16>
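// Re-view the updated cache and scatter the quantized values at slot
// page_id * 64 + 1 (%691), then flatten back to the [pages, 2097152] layout.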
%int32_149 = torch.constant.int 32
%int2_150 = torch.constant.int 2
%int32_151 = torch.constant.int 32
%int8_152 = torch.constant.int 8
%int128_153 = torch.constant.int 128
%683 = torch.prim.ListConstruct %661, %int32_149, %int2_150, %int32_151, %int8_152, %int128_153 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%684 = torch.aten.view %682, %683 : !torch.vtensor<[?,2097152],f16>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f16>
torch.bind_symbolic_shape %684, [%549], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f16>
%int32_154 = torch.constant.int 32
%int8_155 = torch.constant.int 8
%int128_156 = torch.constant.int 128
%685 = torch.prim.ListConstruct %665, %int32_154, %int8_155, %int128_156 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%686 = torch.aten.view %684, %685 : !torch.vtensor<[?,32,2,32,8,128],f16>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f16>
torch.bind_symbolic_shape %686, [%549], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f16>
%int1_157 = torch.constant.int 1
%int32_158 = torch.constant.int 32
%int8_159 = torch.constant.int 8
%int128_160 = torch.constant.int 128
%687 = torch.prim.ListConstruct %int1_157, %670, %int32_158, %int8_159, %int128_160 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%688 = torch.aten.view %660, %687 : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %688, [%548], affine_map<()[s0] -> (1, s0, 32, 8, 128)> : !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>
%int32_161 = torch.constant.int 32
%int8_162 = torch.constant.int 8
%int128_163 = torch.constant.int 128
%689 = torch.prim.ListConstruct %670, %int32_161, %int8_162, %int128_163 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%690 = torch.aten.view %688, %689 : !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %690, [%548], affine_map<()[s0] -> (s0, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
%int1_164 = torch.constant.int 1
%int1_165 = torch.constant.int 1
%691 = torch.aten.add.Scalar %669, %int1_164, %int1_165 : !torch.vtensor<[1,?],si64>, !torch.int, !torch.int -> !torch.vtensor<[1,?],si64>
torch.bind_symbolic_shape %691, [%548], affine_map<()[s0] -> (1, s0)> : !torch.vtensor<[1,?],si64>
%692 = torch.prim.ListConstruct %670 : (!torch.int) -> !torch.list<int>
%693 = torch.aten.view %691, %692 : !torch.vtensor<[1,?],si64>, !torch.list<int> -> !torch.vtensor<[?],si64>
torch.bind_symbolic_shape %693, [%548], affine_map<()[s0] -> (s0)> : !torch.vtensor<[?],si64>
%694 = torch.prim.ListConstruct %693 : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>>
%false_166 = torch.constant.bool false
%695 = torch.aten.index_put %686, %694, %690, %false_166 : !torch.vtensor<[?,32,8,128],f16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.bool -> !torch.vtensor<[?,32,8,128],f16>
torch.bind_symbolic_shape %695, [%549], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f16>
%int32_167 = torch.constant.int 32
%int2_168 = torch.constant.int 2
%int32_169 = torch.constant.int 32
%int8_170 = torch.constant.int 8
%int128_171 = torch.constant.int 128
%696 = torch.prim.ListConstruct %661, %int32_167, %int2_168, %int32_169, %int8_170, %int128_171 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%697 = torch.aten.view %695, %696 : !torch.vtensor<[?,32,8,128],f16>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f16>
torch.bind_symbolic_shape %697, [%549], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f16>
%int2097152_172 = torch.constant.int 2097152
%698 = torch.prim.ListConstruct %661, %int2097152_172 : (!torch.int, !torch.int) -> !torch.list<int>
%699 = torch.aten.view %697, %698 : !torch.vtensor<[?,32,2,32,8,128],f16>, !torch.list<int> -> !torch.vtensor<[?,2097152],f16>
torch.bind_symbolic_shape %699, [%549], affine_map<()[s0] -> (s0, 2097152)> : !torch.vtensor<[?,2097152],f16>
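// Grouped-query attention: unsqueeze the 8 kv-heads of K and V, expand each by
// a factor of 4, and reshape to 32 heads to match the 32 query heads.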
%int-2_173 = torch.constant.int -2
%700 = torch.aten.unsqueeze %657, %int-2_173 : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %700, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 1, 128)> : !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>
%int1_174 = torch.constant.int 1
%int8_175 = torch.constant.int 8
%int4 = torch.constant.int 4
%int128_176 = torch.constant.int 128
%701 = torch.prim.ListConstruct %int1_174, %639, %int8_175, %int4, %int128_176 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%false_177 = torch.constant.bool false
%702 = torch.aten.expand %700, %701, %false_177 : !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %702, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 4, 128)> : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
%int0_178 = torch.constant.int 0
%703 = torch.aten.clone %702, %int0_178 : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %703, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 4, 128)> : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
%int1_179 = torch.constant.int 1
%int32_180 = torch.constant.int 32
%int128_181 = torch.constant.int 128
%704 = torch.prim.ListConstruct %int1_179, %639, %int32_180, %int128_181 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%705 = torch.aten._unsafe_view %703, %704 : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %705, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>
%int-2_182 = torch.constant.int -2
%706 = torch.aten.unsqueeze %660, %int-2_182 : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %706, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 1, 128)> : !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>
%int1_183 = torch.constant.int 1
%707 = torch.aten.size.int %591, %int1_183 : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>, !torch.int -> !torch.int
%int1_184 = torch.constant.int 1
%int8_185 = torch.constant.int 8
%int4_186 = torch.constant.int 4
%int128_187 = torch.constant.int 128
%708 = torch.prim.ListConstruct %int1_184, %707, %int8_185, %int4_186, %int128_187 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%false_188 = torch.constant.bool false
%709 = torch.aten.expand %706, %708, %false_188 : !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %709, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 4, 128)> : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
%int0_189 = torch.constant.int 0
%710 = torch.aten.clone %709, %int0_189 : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %710, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 4, 128)> : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
%int1_190 = torch.constant.int 1
%int32_191 = torch.constant.int 32
%int128_192 = torch.constant.int 128
%711 = torch.prim.ListConstruct %int1_190, %707, %int32_191, %int128_192 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%712 = torch.aten._unsafe_view %710, %711 : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %712, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>
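// Dequantize the expanded K and V: upcast f8E4M3FNUZ -> f32, multiply by the
// scale %8, and cast down to bf16 for attention.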
%int6_193 = torch.constant.int 6
%713 = torch.prims.convert_element_type %705, %int6_193 : !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,32,128],f32>
torch.bind_symbolic_shape %713, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32>
%714 = torch.aten.mul.Tensor %713, %8 : !torch.vtensor<[1,?,32,128],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,32,128],f32>
torch.bind_symbolic_shape %714, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32>
%int15_194 = torch.constant.int 15
%715 = torch.prims.convert_element_type %714, %int15_194 : !torch.vtensor<[1,?,32,128],f32>, !torch.int -> !torch.vtensor<[1,?,32,128],bf16>
torch.bind_symbolic_shape %715, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16>
%int6_195 = torch.constant.int 6
%716 = torch.prims.convert_element_type %712, %int6_195 : !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,32,128],f32>
torch.bind_symbolic_shape %716, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32>
%717 = torch.aten.mul.Tensor %716, %8 : !torch.vtensor<[1,?,32,128],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,32,128],f32>
torch.bind_symbolic_shape %717, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32>
%int15_196 = torch.constant.int 15
%718 = torch.prims.convert_element_type %717, %int15_196 : !torch.vtensor<[1,?,32,128],f32>, !torch.int -> !torch.vtensor<[1,?,32,128],bf16>
torch.bind_symbolic_shape %718, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16>
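// Transpose Q, K, V to [1, 32, seq, 128] and run causal scaled-dot-product
// attention (dropout 0.0, is_causal = true) through the CPU flash-attention op.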
%int1_197 = torch.constant.int 1
%int2_198 = torch.constant.int 2
%719 = torch.aten.transpose.int %626, %int1_197, %int2_198 : !torch.vtensor<[1,?,32,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,32,?,128],bf16>
torch.bind_symbolic_shape %719, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16>
%int1_199 = torch.constant.int 1
%int2_200 = torch.constant.int 2
%720 = torch.aten.transpose.int %715, %int1_199, %int2_200 : !torch.vtensor<[1,?,32,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,32,?,128],bf16>
torch.bind_symbolic_shape %720, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16>
%int1_201 = torch.constant.int 1
%int2_202 = torch.constant.int 2
%721 = torch.aten.transpose.int %718, %int1_201, %int2_202 : !torch.vtensor<[1,?,32,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,32,?,128],bf16>
torch.bind_symbolic_shape %721, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16>
%float0.000000e00 = torch.constant.float 0.000000e+00
%true_203 = torch.constant.bool true
%none_204 = torch.constant.none
%none_205 = torch.constant.none
%722:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%719, %720, %721, %float0.000000e00, %true_203, %none_204, %none_205) : (!torch.vtensor<[1,32,?,128],bf16>, !torch.vtensor<[1,32,?,128],bf16>, !torch.vtensor<[1,32,?,128],bf16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[1,32,?,128],bf16>, !torch.vtensor<[1,32,?],f32>)
torch.bind_symbolic_shape %722#0, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16>
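// Transpose the attention output back to [1, seq, 32, 128], flatten to
// [1, seq, 4096], quantize to f8E4M3FNUZ (scale %9, clamp +/-240), apply the
// 4096x4096 output projection %10, dequantize to bf16, and add the residual %551.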
%int1_206 = torch.constant.int 1
%int2_207 = torch.constant.int 2
%723 = torch.aten.transpose.int %722#0, %int1_206, %int2_207 : !torch.vtensor<[1,32,?,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,?,32,128],bf16>
torch.bind_symbolic_shape %723, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16>
%int1_208 = torch.constant.int 1
%int4096_209 = torch.constant.int 4096
%724 = torch.prim.ListConstruct %int1_208, %611, %int4096_209 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%725 = torch.aten.view %723, %724 : !torch.vtensor<[1,?,32,128],bf16>, !torch.list<int> -> !torch.vtensor<[1,?,4096],bf16>
torch.bind_symbolic_shape %725, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16>
%726 = torch.aten.div.Tensor %725, %9 : !torch.vtensor<[1,?,4096],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],bf16>
torch.bind_symbolic_shape %726, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16>
%float-2.400000e02_210 = torch.constant.float -2.400000e+02
%float2.400000e02_211 = torch.constant.float 2.400000e+02
%727 = torch.aten.clamp %726, %float-2.400000e02_210, %float2.400000e02_211 : !torch.vtensor<[1,?,4096],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],bf16>
torch.bind_symbolic_shape %727, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16>
%int26_212 = torch.constant.int 26
%728 = torch.prims.convert_element_type %727, %int26_212 : !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %728, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int-2_213 = torch.constant.int -2
%int-1_214 = torch.constant.int -1
%729 = torch.aten.transpose.int %10, %int-2_213, %int-1_214 : !torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%int4096_215 = torch.constant.int 4096
%730 = torch.prim.ListConstruct %611, %int4096_215 : (!torch.int, !torch.int) -> !torch.list<int>
%731 = torch.aten.view %728, %730 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %731, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%732 = torch.aten.mm %731, %729 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,4096],f8E4M3FNUZ> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %732, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%int1_216 = torch.constant.int 1
%int4096_217 = torch.constant.int 4096
%733 = torch.prim.ListConstruct %int1_216, %611, %int4096_217 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%734 = torch.aten.view %732, %733 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %734, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int15_218 = torch.constant.int 15
%735 = torch.prims.convert_element_type %734, %int15_218 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,4096],bf16>
torch.bind_symbolic_shape %735, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16>
%int1_219 = torch.constant.int 1
%736 = torch.aten.add.Tensor %551, %735, %int1_219 : !torch.vtensor<[1,?,4096],f16>, !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %736, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
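// RMSNorm over the residual stream: mean of squares along the last dim,
// add eps 1e-05, rsqrt, scale, then multiply by the [4096] norm weight %11.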
%int2_220 = torch.constant.int 2
%737 = torch.aten.pow.Tensor_Scalar %736, %int2_220 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %737, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int-1_221 = torch.constant.int -1
%738 = torch.prim.ListConstruct %int-1_221 : (!torch.int) -> !torch.list<int>
%true_222 = torch.constant.bool true
%none_223 = torch.constant.none
%739 = torch.aten.mean.dim %737, %738, %true_222, %none_223 : !torch.vtensor<[1,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,?,1],f32>
torch.bind_symbolic_shape %739, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32>
%float1.000000e-05_224 = torch.constant.float 1.000000e-05
%int1_225 = torch.constant.int 1
%740 = torch.aten.add.Scalar %739, %float1.000000e-05_224, %int1_225 : !torch.vtensor<[1,?,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,?,1],f32>
torch.bind_symbolic_shape %740, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32>
%741 = torch.aten.rsqrt %740 : !torch.vtensor<[1,?,1],f32> -> !torch.vtensor<[1,?,1],f32>
torch.bind_symbolic_shape %741, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32>
%742 = torch.aten.mul.Tensor %736, %741 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[1,?,1],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %742, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%743 = torch.aten.mul.Tensor %11, %742 : !torch.vtensor<[4096],bf16>, !torch.vtensor<[1,?,4096],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %743, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
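// SwiGLU-style feed-forward in f8E4M3FNUZ: quantize the normed activations per
// projection (scales %12, %14, clamp +/-240), compute gate = silu(x @ %13^T) and
// up = x @ %15^T (both 4096 -> 14336), multiply them, re-quantize with scale %16,
// and project back down through %17 (14336 -> 4096).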
%744 = torch.aten.div.Tensor %743, %12 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %744, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%float-2.400000e02_226 = torch.constant.float -2.400000e+02
%float2.400000e02_227 = torch.constant.float 2.400000e+02
%745 = torch.aten.clamp %744, %float-2.400000e02_226, %float2.400000e02_227 : !torch.vtensor<[1,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %745, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int26_228 = torch.constant.int 26
%746 = torch.prims.convert_element_type %745, %int26_228 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %746, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int-2_229 = torch.constant.int -2
%int-1_230 = torch.constant.int -1
%747 = torch.aten.transpose.int %13, %int-2_229, %int-1_230 : !torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
%int4096_231 = torch.constant.int 4096
%748 = torch.prim.ListConstruct %566, %int4096_231 : (!torch.int, !torch.int) -> !torch.list<int>
%749 = torch.aten.view %746, %748 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %749, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%750 = torch.aten.mm %749, %747 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,14336],f8E4M3FNUZ> -> !torch.vtensor<[?,14336],f8E4M3FNUZ>
torch.bind_symbolic_shape %750, [%548], affine_map<()[s0] -> (s0 * 32, 14336)> : !torch.vtensor<[?,14336],f8E4M3FNUZ>
%int1_232 = torch.constant.int 1
%int14336 = torch.constant.int 14336
%751 = torch.prim.ListConstruct %int1_232, %566, %int14336 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%752 = torch.aten.view %750, %751 : !torch.vtensor<[?,14336],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,14336],f8E4M3FNUZ>
torch.bind_symbolic_shape %752, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>
%int15_233 = torch.constant.int 15
%753 = torch.prims.convert_element_type %752, %int15_233 : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,14336],bf16>
torch.bind_symbolic_shape %753, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16>
%754 = torch.aten.silu %753 : !torch.vtensor<[1,?,14336],bf16> -> !torch.vtensor<[1,?,14336],bf16>
torch.bind_symbolic_shape %754, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16>
%755 = torch.aten.div.Tensor %743, %14 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %755, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%float-2.400000e02_234 = torch.constant.float -2.400000e+02
%float2.400000e02_235 = torch.constant.float 2.400000e+02
%756 = torch.aten.clamp %755, %float-2.400000e02_234, %float2.400000e02_235 : !torch.vtensor<[1,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %756, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int26_236 = torch.constant.int 26
%757 = torch.prims.convert_element_type %756, %int26_236 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %757, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int-2_237 = torch.constant.int -2
%int-1_238 = torch.constant.int -1
%758 = torch.aten.transpose.int %15, %int-2_237, %int-1_238 : !torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
%int4096_239 = torch.constant.int 4096
%759 = torch.prim.ListConstruct %566, %int4096_239 : (!torch.int, !torch.int) -> !torch.list<int>
%760 = torch.aten.view %757, %759 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %760, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%761 = torch.aten.mm %760, %758 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,14336],f8E4M3FNUZ> -> !torch.vtensor<[?,14336],f8E4M3FNUZ>
torch.bind_symbolic_shape %761, [%548], affine_map<()[s0] -> (s0 * 32, 14336)> : !torch.vtensor<[?,14336],f8E4M3FNUZ>
%int1_240 = torch.constant.int 1
%int14336_241 = torch.constant.int 14336
%762 = torch.prim.ListConstruct %int1_240, %566, %int14336_241 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%763 = torch.aten.view %761, %762 : !torch.vtensor<[?,14336],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,14336],f8E4M3FNUZ>
torch.bind_symbolic_shape %763, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>
%int15_242 = torch.constant.int 15
%764 = torch.prims.convert_element_type %763, %int15_242 : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,14336],bf16>
torch.bind_symbolic_shape %764, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16>
%765 = torch.aten.mul.Tensor %754, %764 : !torch.vtensor<[1,?,14336],bf16>, !torch.vtensor<[1,?,14336],bf16> -> !torch.vtensor<[1,?,14336],bf16>
torch.bind_symbolic_shape %765, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16>
%766 = torch.aten.div.Tensor %765, %16 : !torch.vtensor<[1,?,14336],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,14336],bf16>
torch.bind_symbolic_shape %766, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16>
%float-2.400000e02_243 = torch.constant.float -2.400000e+02
%float2.400000e02_244 = torch.constant.float 2.400000e+02
%767 = torch.aten.clamp %766, %float-2.400000e02_243, %float2.400000e02_244 : !torch.vtensor<[1,?,14336],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,14336],bf16>
torch.bind_symbolic_shape %767, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16>
%int26_245 = torch.constant.int 26
%768 = torch.prims.convert_element_type %767, %int26_245 : !torch.vtensor<[1,?,14336],bf16>, !torch.int -> !torch.vtensor<[1,?,14336],f8E4M3FNUZ>
torch.bind_symbolic_shape %768, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>
%int-2_246 = torch.constant.int -2
%int-1_247 = torch.constant.int -1
%769 = torch.aten.transpose.int %17, %int-2_246, %int-1_247 : !torch.vtensor<[4096,14336],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%int1_248 = torch.constant.int 1
%770 = torch.aten.size.int %752, %int1_248 : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>, !torch.int -> !torch.int
%int14336_249 = torch.constant.int 14336
%771 = torch.prim.ListConstruct %770, %int14336_249 : (!torch.int, !torch.int) -> !torch.list<int>
%772 = torch.aten.view %768, %771 : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,14336],f8E4M3FNUZ>
torch.bind_symbolic_shape %772, [%548], affine_map<()[s0] -> (s0 * 32, 14336)> : !torch.vtensor<[?,14336],f8E4M3FNUZ>
%773 = torch.aten.mm %772, %769 : !torch.vtensor<[?,14336],f8E4M3FNUZ>, !torch.vtensor<[14336,4096],f8E4M3FNUZ> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %773, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%int1_250 = torch.constant.int 1
%int4096_251 = torch.constant.int 4096
%774 = torch.prim.ListConstruct %int1_250, %770, %int4096_251 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%775 = torch.aten.view %773, %774 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %775, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int15_252 = torch.constant.int 15
%776 = torch.prims.convert_element_type %775, %int15_252 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,4096],bf16>
torch.bind_symbolic_shape %776, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16>
%int1_253 = torch.constant.int 1
%777 = torch.aten.add.Tensor %736, %776, %int1_253 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %777, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
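// The residual add above closes this transformer block. Below: what appears to
// be the next block's attention pre-norm (RMSNorm, eps 1e-05, weight %18),
// followed by quantization of its output for the Q/K/V projections.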
%int2_254 = torch.constant.int 2
%778 = torch.aten.pow.Tensor_Scalar %777, %int2_254 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %778, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int-1_255 = torch.constant.int -1
%779 = torch.prim.ListConstruct %int-1_255 : (!torch.int) -> !torch.list<int>
%true_256 = torch.constant.bool true
%none_257 = torch.constant.none
%780 = torch.aten.mean.dim %778, %779, %true_256, %none_257 : !torch.vtensor<[1,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,?,1],f32>
torch.bind_symbolic_shape %780, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32>
%float1.000000e-05_258 = torch.constant.float 1.000000e-05
%int1_259 = torch.constant.int 1
%781 = torch.aten.add.Scalar %780, %float1.000000e-05_258, %int1_259 : !torch.vtensor<[1,?,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,?,1],f32>
torch.bind_symbolic_shape %781, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32>
%782 = torch.aten.rsqrt %781 : !torch.vtensor<[1,?,1],f32> -> !torch.vtensor<[1,?,1],f32>
torch.bind_symbolic_shape %782, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32>
%783 = torch.aten.mul.Tensor %777, %782 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[1,?,1],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %783, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%784 = torch.aten.mul.Tensor %18, %783 : !torch.vtensor<[4096],bf16>, !torch.vtensor<[1,?,4096],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %784, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%785 = torch.aten.div.Tensor %784, %19 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %785, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%float-2.400000e02_260 = torch.constant.float -2.400000e+02
%float2.400000e02_261 = torch.constant.float 2.400000e+02
%786 = torch.aten.clamp %785, %float-2.400000e02_260, %float2.400000e02_261 : !torch.vtensor<[1,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %786, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int26_262 = torch.constant.int 26
%787 = torch.prims.convert_element_type %786, %int26_262 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %787, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
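// Q/K/V projections for this block in f8E4M3FNUZ: the 4096x4096 weight %20 and
// the 1024x4096 weights %22 and %24 are applied to the quantized normed input;
// each result is dequantized to bf16 and reshaped to [1, seq, 32, 128] (Q) or
// [1, seq, 8, 128] (K, V).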
%int-2_263 = torch.constant.int -2
%int-1_264 = torch.constant.int -1
%788 = torch.aten.transpose.int %20, %int-2_263, %int-1_264 : !torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%int4096_265 = torch.constant.int 4096
%789 = torch.prim.ListConstruct %566, %int4096_265 : (!torch.int, !torch.int) -> !torch.list<int>
%790 = torch.aten.view %787, %789 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %790, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%791 = torch.aten.mm %790, %788 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,4096],f8E4M3FNUZ> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %791, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%int1_266 = torch.constant.int 1
%int4096_267 = torch.constant.int 4096
%792 = torch.prim.ListConstruct %int1_266, %566, %int4096_267 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%793 = torch.aten.view %791, %792 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %793, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int15_268 = torch.constant.int 15
%794 = torch.prims.convert_element_type %793, %int15_268 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,4096],bf16>
torch.bind_symbolic_shape %794, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16>
%795 = torch.aten.div.Tensor %784, %21 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %795, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%float-2.400000e02_269 = torch.constant.float -2.400000e+02
%float2.400000e02_270 = torch.constant.float 2.400000e+02
%796 = torch.aten.clamp %795, %float-2.400000e02_269, %float2.400000e02_270 : !torch.vtensor<[1,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %796, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int26_271 = torch.constant.int 26
%797 = torch.prims.convert_element_type %796, %int26_271 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %797, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int-2_272 = torch.constant.int -2
%int-1_273 = torch.constant.int -1
%798 = torch.aten.transpose.int %22, %int-2_272, %int-1_273 : !torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,1024],f8E4M3FNUZ>
%int4096_274 = torch.constant.int 4096
%799 = torch.prim.ListConstruct %566, %int4096_274 : (!torch.int, !torch.int) -> !torch.list<int>
%800 = torch.aten.view %797, %799 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %800, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%801 = torch.aten.mm %800, %798 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,1024],f8E4M3FNUZ> -> !torch.vtensor<[?,1024],f8E4M3FNUZ>
torch.bind_symbolic_shape %801, [%548], affine_map<()[s0] -> (s0 * 32, 1024)> : !torch.vtensor<[?,1024],f8E4M3FNUZ>
%int1_275 = torch.constant.int 1
%int1024_276 = torch.constant.int 1024
%802 = torch.prim.ListConstruct %int1_275, %566, %int1024_276 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%803 = torch.aten.view %801, %802 : !torch.vtensor<[?,1024],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,1024],f8E4M3FNUZ>
torch.bind_symbolic_shape %803, [%548], affine_map<()[s0] -> (1, s0 * 32, 1024)> : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>
%int15_277 = torch.constant.int 15
%804 = torch.prims.convert_element_type %803, %int15_277 : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,1024],bf16>
torch.bind_symbolic_shape %804, [%548], affine_map<()[s0] -> (1, s0 * 32, 1024)> : !torch.vtensor<[1,?,1024],bf16>
%805 = torch.aten.div.Tensor %784, %23 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %805, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%float-2.400000e02_278 = torch.constant.float -2.400000e+02
%float2.400000e02_279 = torch.constant.float 2.400000e+02
%806 = torch.aten.clamp %805, %float-2.400000e02_278, %float2.400000e02_279 : !torch.vtensor<[1,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %806, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int26_280 = torch.constant.int 26
%807 = torch.prims.convert_element_type %806, %int26_280 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %807, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int-2_281 = torch.constant.int -2
%int-1_282 = torch.constant.int -1
%808 = torch.aten.transpose.int %24, %int-2_281, %int-1_282 : !torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,1024],f8E4M3FNUZ>
%int4096_283 = torch.constant.int 4096
%809 = torch.prim.ListConstruct %566, %int4096_283 : (!torch.int, !torch.int) -> !torch.list<int>
%810 = torch.aten.view %807, %809 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %810, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%811 = torch.aten.mm %810, %808 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,1024],f8E4M3FNUZ> -> !torch.vtensor<[?,1024],f8E4M3FNUZ>
torch.bind_symbolic_shape %811, [%548], affine_map<()[s0] -> (s0 * 32, 1024)> : !torch.vtensor<[?,1024],f8E4M3FNUZ>
%int1_284 = torch.constant.int 1
%int1024_285 = torch.constant.int 1024
%812 = torch.prim.ListConstruct %int1_284, %566, %int1024_285 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%813 = torch.aten.view %811, %812 : !torch.vtensor<[?,1024],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,1024],f8E4M3FNUZ>
torch.bind_symbolic_shape %813, [%548], affine_map<()[s0] -> (1, s0 * 32, 1024)> : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>
%int15_286 = torch.constant.int 15
%814 = torch.prims.convert_element_type %813, %int15_286 : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,1024],bf16>
torch.bind_symbolic_shape %814, [%548], affine_map<()[s0] -> (1, s0 * 32, 1024)> : !torch.vtensor<[1,?,1024],bf16>
%int1_287 = torch.constant.int 1
%int32_288 = torch.constant.int 32
%int128_289 = torch.constant.int 128
%815 = torch.prim.ListConstruct %int1_287, %566, %int32_288, %int128_289 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%816 = torch.aten.view %794, %815 : !torch.vtensor<[1,?,4096],bf16>, !torch.list<int> -> !torch.vtensor<[1,?,32,128],bf16>
torch.bind_symbolic_shape %816, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16>
%int1_290 = torch.constant.int 1
%int8_291 = torch.constant.int 8
%int128_292 = torch.constant.int 128
%817 = torch.prim.ListConstruct %int1_290, %566, %int8_291, %int128_292 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%818 = torch.aten.view %804, %817 : !torch.vtensor<[1,?,1024],bf16>, !torch.list<int> -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %818, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
%int1_293 = torch.constant.int 1
%int8_294 = torch.constant.int 8
%int128_295 = torch.constant.int 128
%819 = torch.prim.ListConstruct %int1_293, %566, %int8_294, %int128_295 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%820 = torch.aten.view %814, %819 : !torch.vtensor<[1,?,1024],bf16>, !torch.list<int> -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %820, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
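// The RoPE angle-table construction (positions x theta, base 500000, dim 128)
// repeats below for this block's query tensor.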
%int131072_296 = torch.constant.int 131072
%none_297 = torch.constant.none
%none_298 = torch.constant.none
%cpu_299 = torch.constant.device "cpu"
%false_300 = torch.constant.bool false
%821 = torch.aten.arange %int131072_296, %none_297, %none_298, %cpu_299, %false_300 : !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[131072],si64>
%int0_301 = torch.constant.int 0
%int128_302 = torch.constant.int 128
%none_303 = torch.constant.none
%none_304 = torch.constant.none
%cpu_305 = torch.constant.device "cpu"
%false_306 = torch.constant.bool false
%822 = torch.aten.arange.start %int0_301, %int128_302, %none_303, %none_304, %cpu_305, %false_306 : !torch.int, !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[128],si64>
%int2_307 = torch.constant.int 2
%823 = torch.aten.floor_divide.Scalar %822, %int2_307 : !torch.vtensor<[128],si64>, !torch.int -> !torch.vtensor<[128],si64>
%int6_308 = torch.constant.int 6
%824 = torch.prims.convert_element_type %823, %int6_308 : !torch.vtensor<[128],si64>, !torch.int -> !torch.vtensor<[128],f32>
%int128_309 = torch.constant.int 128
%825 = torch.aten.div.Scalar %824, %int128_309 : !torch.vtensor<[128],f32>, !torch.int -> !torch.vtensor<[128],f32>
%float2.000000e00_310 = torch.constant.float 2.000000e+00
%826 = torch.aten.mul.Scalar %825, %float2.000000e00_310 : !torch.vtensor<[128],f32>, !torch.float -> !torch.vtensor<[128],f32>
%float5.000000e05_311 = torch.constant.float 5.000000e+05
%827 = torch.aten.pow.Scalar %float5.000000e05_311, %826 : !torch.float, !torch.vtensor<[128],f32> -> !torch.vtensor<[128],f32>
%828 = torch.aten.reciprocal %827 : !torch.vtensor<[128],f32> -> !torch.vtensor<[128],f32>
%float1.000000e00_312 = torch.constant.float 1.000000e+00
%829 = torch.aten.mul.Scalar %828, %float1.000000e00_312 : !torch.vtensor<[128],f32>, !torch.float -> !torch.vtensor<[128],f32>
%int131072_313 = torch.constant.int 131072
%int1_314 = torch.constant.int 1
%830 = torch.prim.ListConstruct %int131072_313, %int1_314 : (!torch.int, !torch.int) -> !torch.list<int>
%831 = torch.aten.view %821, %830 : !torch.vtensor<[131072],si64>, !torch.list<int> -> !torch.vtensor<[131072,1],si64>
%832 = torch.aten.mul.Tensor %831, %829 : !torch.vtensor<[131072,1],si64>, !torch.vtensor<[128],f32> -> !torch.vtensor<[131072,128],f32>
%int1_315 = torch.constant.int 1
%833 = torch.aten.size.int %793, %int1_315 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.int -> !torch.int
%int0_316 = torch.constant.int 0
%834 = torch.aten.add.int %int0_316, %833 : !torch.int, !torch.int -> !torch.int
%int0_317 = torch.constant.int 0
%int0_318 = torch.constant.int 0
%int1_319 = torch.constant.int 1
%835 = torch.aten.slice.Tensor %832, %int0_317, %int0_318, %834, %int1_319 : !torch.vtensor<[131072,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],f32>
torch.bind_symbolic_shape %835, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],f32>
%int1_320 = torch.constant.int 1
%int0_321 = torch.constant.int 0
%int9223372036854775807_322 = torch.constant.int 9223372036854775807
%int1_323 = torch.constant.int 1
%836 = torch.aten.slice.Tensor %835, %int1_320, %int0_321, %int9223372036854775807_322, %int1_323 : !torch.vtensor<[?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],f32>
torch.bind_symbolic_shape %836, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],f32>
%int1_324 = torch.constant.int 1
%int0_325 = torch.constant.int 0
%int9223372036854775807_326 = torch.constant.int 9223372036854775807
%int1_327 = torch.constant.int 1
%837 = torch.aten.slice.Tensor %836, %int1_324, %int0_325, %int9223372036854775807_326, %int1_327 : !torch.vtensor<[?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],f32>
torch.bind_symbolic_shape %837, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],f32>
%int0_328 = torch.constant.int 0
%838 = torch.aten.unsqueeze %837, %int0_328 : !torch.vtensor<[?,128],f32>, !torch.int -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %838, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int1_329 = torch.constant.int 1
%int0_330 = torch.constant.int 0
%int9223372036854775807_331 = torch.constant.int 9223372036854775807
%int1_332 = torch.constant.int 1
%839 = torch.aten.slice.Tensor %838, %int1_329, %int0_330, %int9223372036854775807_331, %int1_332 : !torch.vtensor<[1,?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %839, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int2_333 = torch.constant.int 2
%int0_334 = torch.constant.int 0
%int9223372036854775807_335 = torch.constant.int 9223372036854775807
%int1_336 = torch.constant.int 1
%840 = torch.aten.slice.Tensor %839, %int2_333, %int0_334, %int9223372036854775807_335, %int1_336 : !torch.vtensor<[1,?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %840, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int1_337 = torch.constant.int 1
%int1_338 = torch.constant.int 1
%int1_339 = torch.constant.int 1
%841 = torch.prim.ListConstruct %int1_337, %int1_338, %int1_339 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%842 = torch.aten.repeat %840, %841 : !torch.vtensor<[1,?,128],f32>, !torch.list<int> -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %842, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
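// Rotate the 32 query heads: upcast %816 to f32, apply the sharktank_rotary_embedding_1_D_32_128_f32
// kernel with the sliced angle table (%842), and narrow the result back to bf16.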
%int6_340 = torch.constant.int 6
%843 = torch.prims.convert_element_type %816, %int6_340 : !torch.vtensor<[1,?,32,128],bf16>, !torch.int -> !torch.vtensor<[1,?,32,128],f32>
torch.bind_symbolic_shape %843, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32>
%844 = torch_c.to_builtin_tensor %843 : !torch.vtensor<[1,?,32,128],f32> -> tensor<1x?x32x128xf32>
%845 = torch_c.to_builtin_tensor %842 : !torch.vtensor<[1,?,128],f32> -> tensor<1x?x128xf32>
%846 = util.call @sharktank_rotary_embedding_1_D_32_128_f32(%844, %845) : (tensor<1x?x32x128xf32>, tensor<1x?x128xf32>) -> tensor<1x?x32x128xf32>
%847 = torch_c.from_builtin_tensor %846 : tensor<1x?x32x128xf32> -> !torch.vtensor<[1,?,32,128],f32>
torch.bind_symbolic_shape %847, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32>
%int15_341 = torch.constant.int 15
%848 = torch.prims.convert_element_type %847, %int15_341 : !torch.vtensor<[1,?,32,128],f32>, !torch.int -> !torch.vtensor<[1,?,32,128],bf16>
torch.bind_symbolic_shape %848, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16>
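// The same angle-table construction and rotary kernel are repeated for the 8 key heads,
// this time through sharktank_rotary_embedding_1_D_8_128_f32.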
%int131072_342 = torch.constant.int 131072
%none_343 = torch.constant.none
%none_344 = torch.constant.none
%cpu_345 = torch.constant.device "cpu"
%false_346 = torch.constant.bool false
%849 = torch.aten.arange %int131072_342, %none_343, %none_344, %cpu_345, %false_346 : !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[131072],si64>
%int0_347 = torch.constant.int 0
%int128_348 = torch.constant.int 128
%none_349 = torch.constant.none
%none_350 = torch.constant.none
%cpu_351 = torch.constant.device "cpu"
%false_352 = torch.constant.bool false
%850 = torch.aten.arange.start %int0_347, %int128_348, %none_349, %none_350, %cpu_351, %false_352 : !torch.int, !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[128],si64>
%int2_353 = torch.constant.int 2
%851 = torch.aten.floor_divide.Scalar %850, %int2_353 : !torch.vtensor<[128],si64>, !torch.int -> !torch.vtensor<[128],si64>
%int6_354 = torch.constant.int 6
%852 = torch.prims.convert_element_type %851, %int6_354 : !torch.vtensor<[128],si64>, !torch.int -> !torch.vtensor<[128],f32>
%int128_355 = torch.constant.int 128
%853 = torch.aten.div.Scalar %852, %int128_355 : !torch.vtensor<[128],f32>, !torch.int -> !torch.vtensor<[128],f32>
%float2.000000e00_356 = torch.constant.float 2.000000e+00
%854 = torch.aten.mul.Scalar %853, %float2.000000e00_356 : !torch.vtensor<[128],f32>, !torch.float -> !torch.vtensor<[128],f32>
%float5.000000e05_357 = torch.constant.float 5.000000e+05
%855 = torch.aten.pow.Scalar %float5.000000e05_357, %854 : !torch.float, !torch.vtensor<[128],f32> -> !torch.vtensor<[128],f32>
%856 = torch.aten.reciprocal %855 : !torch.vtensor<[128],f32> -> !torch.vtensor<[128],f32>
%float1.000000e00_358 = torch.constant.float 1.000000e+00
%857 = torch.aten.mul.Scalar %856, %float1.000000e00_358 : !torch.vtensor<[128],f32>, !torch.float -> !torch.vtensor<[128],f32>
%int131072_359 = torch.constant.int 131072
%int1_360 = torch.constant.int 1
%858 = torch.prim.ListConstruct %int131072_359, %int1_360 : (!torch.int, !torch.int) -> !torch.list<int>
%859 = torch.aten.view %849, %858 : !torch.vtensor<[131072],si64>, !torch.list<int> -> !torch.vtensor<[131072,1],si64>
%860 = torch.aten.mul.Tensor %859, %857 : !torch.vtensor<[131072,1],si64>, !torch.vtensor<[128],f32> -> !torch.vtensor<[131072,128],f32>
%int1_361 = torch.constant.int 1
%861 = torch.aten.size.int %803, %int1_361 : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>, !torch.int -> !torch.int
%int0_362 = torch.constant.int 0
%862 = torch.aten.add.int %int0_362, %861 : !torch.int, !torch.int -> !torch.int
%int0_363 = torch.constant.int 0
%int0_364 = torch.constant.int 0
%int1_365 = torch.constant.int 1
%863 = torch.aten.slice.Tensor %860, %int0_363, %int0_364, %862, %int1_365 : !torch.vtensor<[131072,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],f32>
torch.bind_symbolic_shape %863, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],f32>
%int1_366 = torch.constant.int 1
%int0_367 = torch.constant.int 0
%int9223372036854775807_368 = torch.constant.int 9223372036854775807
%int1_369 = torch.constant.int 1
%864 = torch.aten.slice.Tensor %863, %int1_366, %int0_367, %int9223372036854775807_368, %int1_369 : !torch.vtensor<[?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],f32>
torch.bind_symbolic_shape %864, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],f32>
%int1_370 = torch.constant.int 1
%int0_371 = torch.constant.int 0
%int9223372036854775807_372 = torch.constant.int 9223372036854775807
%int1_373 = torch.constant.int 1
%865 = torch.aten.slice.Tensor %864, %int1_370, %int0_371, %int9223372036854775807_372, %int1_373 : !torch.vtensor<[?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],f32>
torch.bind_symbolic_shape %865, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],f32>
%int0_374 = torch.constant.int 0
%866 = torch.aten.unsqueeze %865, %int0_374 : !torch.vtensor<[?,128],f32>, !torch.int -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %866, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int1_375 = torch.constant.int 1
%int0_376 = torch.constant.int 0
%int9223372036854775807_377 = torch.constant.int 9223372036854775807
%int1_378 = torch.constant.int 1
%867 = torch.aten.slice.Tensor %866, %int1_375, %int0_376, %int9223372036854775807_377, %int1_378 : !torch.vtensor<[1,?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %867, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int2_379 = torch.constant.int 2
%int0_380 = torch.constant.int 0
%int9223372036854775807_381 = torch.constant.int 9223372036854775807
%int1_382 = torch.constant.int 1
%868 = torch.aten.slice.Tensor %867, %int2_379, %int0_380, %int9223372036854775807_381, %int1_382 : !torch.vtensor<[1,?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %868, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int1_383 = torch.constant.int 1
%int1_384 = torch.constant.int 1
%int1_385 = torch.constant.int 1
%869 = torch.prim.ListConstruct %int1_383, %int1_384, %int1_385 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%870 = torch.aten.repeat %868, %869 : !torch.vtensor<[1,?,128],f32>, !torch.list<int> -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %870, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int6_386 = torch.constant.int 6
%871 = torch.prims.convert_element_type %818, %int6_386 : !torch.vtensor<[1,?,8,128],bf16>, !torch.int -> !torch.vtensor<[1,?,8,128],f32>
torch.bind_symbolic_shape %871, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],f32>
%872 = torch_c.to_builtin_tensor %871 : !torch.vtensor<[1,?,8,128],f32> -> tensor<1x?x8x128xf32>
%873 = torch_c.to_builtin_tensor %870 : !torch.vtensor<[1,?,128],f32> -> tensor<1x?x128xf32>
%874 = util.call @sharktank_rotary_embedding_1_D_8_128_f32(%872, %873) : (tensor<1x?x8x128xf32>, tensor<1x?x128xf32>) -> tensor<1x?x8x128xf32>
%875 = torch_c.from_builtin_tensor %874 : tensor<1x?x8x128xf32> -> !torch.vtensor<[1,?,8,128],f32>
torch.bind_symbolic_shape %875, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],f32>
%int15_387 = torch.constant.int 15
%876 = torch.prims.convert_element_type %875, %int15_387 : !torch.vtensor<[1,?,8,128],f32>, !torch.int -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %876, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
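// Quantize the rotated keys (%876) and the values (%820) for the KV cache: divide by the
// per-tensor scale %25, clamp to the f8E4M3FNUZ range [-240, 240], and cast to f8 (dtype code 26).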
%877 = torch.aten.div.Tensor %876, %25 : !torch.vtensor<[1,?,8,128],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %877, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
%float-2.400000e02_388 = torch.constant.float -2.400000e+02
%float2.400000e02_389 = torch.constant.float 2.400000e+02
%878 = torch.aten.clamp %877, %float-2.400000e02_388, %float2.400000e02_389 : !torch.vtensor<[1,?,8,128],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %878, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
%int26_390 = torch.constant.int 26
%879 = torch.prims.convert_element_type %878, %int26_390 : !torch.vtensor<[1,?,8,128],bf16>, !torch.int -> !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %879, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>
%880 = torch.aten.div.Tensor %820, %25 : !torch.vtensor<[1,?,8,128],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %880, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
%float-2.400000e02_391 = torch.constant.float -2.400000e+02
%float2.400000e02_392 = torch.constant.float 2.400000e+02
%881 = torch.aten.clamp %880, %float-2.400000e02_391, %float2.400000e02_392 : !torch.vtensor<[1,?,8,128],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %881, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
%int26_393 = torch.constant.int 26
%882 = torch.prims.convert_element_type %881, %int26_393 : !torch.vtensor<[1,?,8,128],bf16>, !torch.int -> !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %882, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>
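// Scatter K and V into the paged f16 cache (%699): the cache is viewed as
// [pages, 32, 2, 32, 8, 128] and flattened to [pages*64, 32, 8, 128] rows; row indices come from
// the page table %arg2 (scaled by 64 plus a fixed slot offset), with K written first and V at the
// next slot via index_put.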
%int64_394 = torch.constant.int 64
%883 = torch.aten.mul.Scalar %arg2, %int64_394 : !torch.vtensor<[1,?],si64>, !torch.int -> !torch.vtensor<[1,?],si64>
torch.bind_symbolic_shape %883, [%548], affine_map<()[s0] -> (1, s0)> : !torch.vtensor<[1,?],si64>
%int2_395 = torch.constant.int 2
%int1_396 = torch.constant.int 1
%884 = torch.aten.add.Scalar %883, %int2_395, %int1_396 : !torch.vtensor<[1,?],si64>, !torch.int, !torch.int -> !torch.vtensor<[1,?],si64>
torch.bind_symbolic_shape %884, [%548], affine_map<()[s0] -> (1, s0)> : !torch.vtensor<[1,?],si64>
%int1_397 = torch.constant.int 1
%int32_398 = torch.constant.int 32
%int8_399 = torch.constant.int 8
%int128_400 = torch.constant.int 128
%885 = torch.prim.ListConstruct %int1_397, %670, %int32_398, %int8_399, %int128_400 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%886 = torch.aten.view %879, %885 : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %886, [%548], affine_map<()[s0] -> (1, s0, 32, 8, 128)> : !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>
%int32_401 = torch.constant.int 32
%int8_402 = torch.constant.int 8
%int128_403 = torch.constant.int 128
%887 = torch.prim.ListConstruct %670, %int32_401, %int8_402, %int128_403 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%888 = torch.aten.view %886, %887 : !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %888, [%548], affine_map<()[s0] -> (s0, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
%889 = torch.prim.ListConstruct %670 : (!torch.int) -> !torch.list<int>
%890 = torch.aten.view %884, %889 : !torch.vtensor<[1,?],si64>, !torch.list<int> -> !torch.vtensor<[?],si64>
torch.bind_symbolic_shape %890, [%548], affine_map<()[s0] -> (s0)> : !torch.vtensor<[?],si64>
%int32_404 = torch.constant.int 32
%int2_405 = torch.constant.int 2
%int32_406 = torch.constant.int 32
%int8_407 = torch.constant.int 8
%int128_408 = torch.constant.int 128
%891 = torch.prim.ListConstruct %661, %int32_404, %int2_405, %int32_406, %int8_407, %int128_408 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%892 = torch.aten.view %699, %891 : !torch.vtensor<[?,2097152],f16>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f16>
torch.bind_symbolic_shape %892, [%549], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f16>
%int32_409 = torch.constant.int 32
%893 = torch.aten.mul.int %661, %int32_409 : !torch.int, !torch.int -> !torch.int
%int2_410 = torch.constant.int 2
%894 = torch.aten.mul.int %893, %int2_410 : !torch.int, !torch.int -> !torch.int
%int32_411 = torch.constant.int 32
%int8_412 = torch.constant.int 8
%int128_413 = torch.constant.int 128
%895 = torch.prim.ListConstruct %894, %int32_411, %int8_412, %int128_413 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%896 = torch.aten.view %892, %895 : !torch.vtensor<[?,32,2,32,8,128],f16>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f16>
torch.bind_symbolic_shape %896, [%549], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f16>
%897 = torch.prim.ListConstruct %890 : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>>
%false_414 = torch.constant.bool false
%898 = torch.aten.index_put %896, %897, %888, %false_414 : !torch.vtensor<[?,32,8,128],f16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.bool -> !torch.vtensor<[?,32,8,128],f16>
torch.bind_symbolic_shape %898, [%549], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f16>
%int32_415 = torch.constant.int 32
%int2_416 = torch.constant.int 2
%int32_417 = torch.constant.int 32
%int8_418 = torch.constant.int 8
%int128_419 = torch.constant.int 128
%899 = torch.prim.ListConstruct %661, %int32_415, %int2_416, %int32_417, %int8_418, %int128_419 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%900 = torch.aten.view %898, %899 : !torch.vtensor<[?,32,8,128],f16>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f16>
torch.bind_symbolic_shape %900, [%549], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f16>
%int2097152_420 = torch.constant.int 2097152
%901 = torch.prim.ListConstruct %661, %int2097152_420 : (!torch.int, !torch.int) -> !torch.list<int>
%902 = torch.aten.view %900, %901 : !torch.vtensor<[?,32,2,32,8,128],f16>, !torch.list<int> -> !torch.vtensor<[?,2097152],f16>
torch.bind_symbolic_shape %902, [%549], affine_map<()[s0] -> (s0, 2097152)> : !torch.vtensor<[?,2097152],f16>
%int32_421 = torch.constant.int 32
%int2_422 = torch.constant.int 2
%int32_423 = torch.constant.int 32
%int8_424 = torch.constant.int 8
%int128_425 = torch.constant.int 128
%903 = torch.prim.ListConstruct %661, %int32_421, %int2_422, %int32_423, %int8_424, %int128_425 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%904 = torch.aten.view %902, %903 : !torch.vtensor<[?,2097152],f16>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f16>
torch.bind_symbolic_shape %904, [%549], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f16>
%int32_426 = torch.constant.int 32
%int8_427 = torch.constant.int 8
%int128_428 = torch.constant.int 128
%905 = torch.prim.ListConstruct %894, %int32_426, %int8_427, %int128_428 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%906 = torch.aten.view %904, %905 : !torch.vtensor<[?,32,2,32,8,128],f16>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f16>
torch.bind_symbolic_shape %906, [%549], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f16>
%int1_429 = torch.constant.int 1
%int32_430 = torch.constant.int 32
%int8_431 = torch.constant.int 8
%int128_432 = torch.constant.int 128
%907 = torch.prim.ListConstruct %int1_429, %670, %int32_430, %int8_431, %int128_432 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%908 = torch.aten.view %882, %907 : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %908, [%548], affine_map<()[s0] -> (1, s0, 32, 8, 128)> : !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>
%int32_433 = torch.constant.int 32
%int8_434 = torch.constant.int 8
%int128_435 = torch.constant.int 128
%909 = torch.prim.ListConstruct %670, %int32_433, %int8_434, %int128_435 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%910 = torch.aten.view %908, %909 : !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %910, [%548], affine_map<()[s0] -> (s0, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
%int1_436 = torch.constant.int 1
%int1_437 = torch.constant.int 1
%911 = torch.aten.add.Scalar %884, %int1_436, %int1_437 : !torch.vtensor<[1,?],si64>, !torch.int, !torch.int -> !torch.vtensor<[1,?],si64>
torch.bind_symbolic_shape %911, [%548], affine_map<()[s0] -> (1, s0)> : !torch.vtensor<[1,?],si64>
%912 = torch.prim.ListConstruct %670 : (!torch.int) -> !torch.list<int>
%913 = torch.aten.view %911, %912 : !torch.vtensor<[1,?],si64>, !torch.list<int> -> !torch.vtensor<[?],si64>
torch.bind_symbolic_shape %913, [%548], affine_map<()[s0] -> (s0)> : !torch.vtensor<[?],si64>
%914 = torch.prim.ListConstruct %913 : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>>
%false_438 = torch.constant.bool false
%915 = torch.aten.index_put %906, %914, %910, %false_438 : !torch.vtensor<[?,32,8,128],f16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.bool -> !torch.vtensor<[?,32,8,128],f16>
torch.bind_symbolic_shape %915, [%549], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f16>
%int32_439 = torch.constant.int 32
%int2_440 = torch.constant.int 2
%int32_441 = torch.constant.int 32
%int8_442 = torch.constant.int 8
%int128_443 = torch.constant.int 128
%916 = torch.prim.ListConstruct %661, %int32_439, %int2_440, %int32_441, %int8_442, %int128_443 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%917 = torch.aten.view %915, %916 : !torch.vtensor<[?,32,8,128],f16>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f16>
torch.bind_symbolic_shape %917, [%549], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f16>
%int2097152_444 = torch.constant.int 2097152
%918 = torch.prim.ListConstruct %661, %int2097152_444 : (!torch.int, !torch.int) -> !torch.list<int>
%919 = torch.aten.view %917, %918 : !torch.vtensor<[?,32,2,32,8,128],f16>, !torch.list<int> -> !torch.vtensor<[?,2097152],f16>
torch.bind_symbolic_shape %919, [%549], affine_map<()[s0] -> (s0, 2097152)> : !torch.vtensor<[?,2097152],f16>
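// Grouped-query attention expansion: each of the 8 K/V heads is broadcast to 4 query heads
// (unsqueeze, expand to [1, seq, 8, 4, 128], clone, then view as [1, seq, 32, 128]).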
%int-2_445 = torch.constant.int -2
%920 = torch.aten.unsqueeze %879, %int-2_445 : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %920, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 1, 128)> : !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>
%int1_446 = torch.constant.int 1
%int8_447 = torch.constant.int 8
%int4_448 = torch.constant.int 4
%int128_449 = torch.constant.int 128
%921 = torch.prim.ListConstruct %int1_446, %861, %int8_447, %int4_448, %int128_449 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%false_450 = torch.constant.bool false
%922 = torch.aten.expand %920, %921, %false_450 : !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %922, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 4, 128)> : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
%int0_451 = torch.constant.int 0
%923 = torch.aten.clone %922, %int0_451 : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %923, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 4, 128)> : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
%int1_452 = torch.constant.int 1
%int32_453 = torch.constant.int 32
%int128_454 = torch.constant.int 128
%924 = torch.prim.ListConstruct %int1_452, %861, %int32_453, %int128_454 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%925 = torch.aten._unsafe_view %923, %924 : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %925, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>
%int-2_455 = torch.constant.int -2
%926 = torch.aten.unsqueeze %882, %int-2_455 : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %926, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 1, 128)> : !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>
%int1_456 = torch.constant.int 1
%927 = torch.aten.size.int %813, %int1_456 : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>, !torch.int -> !torch.int
%int1_457 = torch.constant.int 1
%int8_458 = torch.constant.int 8
%int4_459 = torch.constant.int 4
%int128_460 = torch.constant.int 128
%928 = torch.prim.ListConstruct %int1_457, %927, %int8_458, %int4_459, %int128_460 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%false_461 = torch.constant.bool false
%929 = torch.aten.expand %926, %928, %false_461 : !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %929, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 4, 128)> : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
%int0_462 = torch.constant.int 0
%930 = torch.aten.clone %929, %int0_462 : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %930, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 4, 128)> : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
%int1_463 = torch.constant.int 1
%int32_464 = torch.constant.int 32
%int128_465 = torch.constant.int 128
%931 = torch.prim.ListConstruct %int1_463, %927, %int32_464, %int128_465 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%932 = torch.aten._unsafe_view %930, %931 : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %932, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>
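// Dequantize K and V for attention: upcast the f8 values to f32, multiply by the cache scale %25,
// and narrow to bf16.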
%int6_466 = torch.constant.int 6
%933 = torch.prims.convert_element_type %925, %int6_466 : !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,32,128],f32>
torch.bind_symbolic_shape %933, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32>
%934 = torch.aten.mul.Tensor %933, %25 : !torch.vtensor<[1,?,32,128],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,32,128],f32>
torch.bind_symbolic_shape %934, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32>
%int15_467 = torch.constant.int 15
%935 = torch.prims.convert_element_type %934, %int15_467 : !torch.vtensor<[1,?,32,128],f32>, !torch.int -> !torch.vtensor<[1,?,32,128],bf16>
torch.bind_symbolic_shape %935, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16>
%int6_468 = torch.constant.int 6
%936 = torch.prims.convert_element_type %932, %int6_468 : !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,32,128],f32>
torch.bind_symbolic_shape %936, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32>
%937 = torch.aten.mul.Tensor %936, %25 : !torch.vtensor<[1,?,32,128],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,32,128],f32>
torch.bind_symbolic_shape %937, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32>
%int15_469 = torch.constant.int 15
%938 = torch.prims.convert_element_type %937, %int15_469 : !torch.vtensor<[1,?,32,128],f32>, !torch.int -> !torch.vtensor<[1,?,32,128],bf16>
torch.bind_symbolic_shape %938, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16>
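// Attention: transpose Q, K and V to [1, 32, seq, 128] and run
// _scaled_dot_product_flash_attention_for_cpu with dropout 0.0 and is_causal = true.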
%int1_470 = torch.constant.int 1
%int2_471 = torch.constant.int 2
%939 = torch.aten.transpose.int %848, %int1_470, %int2_471 : !torch.vtensor<[1,?,32,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,32,?,128],bf16>
torch.bind_symbolic_shape %939, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16>
%int1_472 = torch.constant.int 1
%int2_473 = torch.constant.int 2
%940 = torch.aten.transpose.int %935, %int1_472, %int2_473 : !torch.vtensor<[1,?,32,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,32,?,128],bf16>
torch.bind_symbolic_shape %940, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16>
%int1_474 = torch.constant.int 1
%int2_475 = torch.constant.int 2
%941 = torch.aten.transpose.int %938, %int1_474, %int2_475 : !torch.vtensor<[1,?,32,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,32,?,128],bf16>
torch.bind_symbolic_shape %941, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16>
%float0.000000e00_476 = torch.constant.float 0.000000e+00
%true_477 = torch.constant.bool true
%none_478 = torch.constant.none
%none_479 = torch.constant.none
%942:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%939, %940, %941, %float0.000000e00_476, %true_477, %none_478, %none_479) : (!torch.vtensor<[1,32,?,128],bf16>, !torch.vtensor<[1,32,?,128],bf16>, !torch.vtensor<[1,32,?,128],bf16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[1,32,?,128],bf16>, !torch.vtensor<[1,32,?],f32>)
torch.bind_symbolic_shape %942#0, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16>
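// Merge the heads back to [1, seq, 4096], requantize the attention output (scale %26, clamp to
// +-240, cast to f8), apply the attn_output matmul against %27 transposed, widen to bf16, and add
// the residual from the block input (%777).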
%int1_480 = torch.constant.int 1
%int2_481 = torch.constant.int 2
%943 = torch.aten.transpose.int %942#0, %int1_480, %int2_481 : !torch.vtensor<[1,32,?,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,?,32,128],bf16>
torch.bind_symbolic_shape %943, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16>
%int1_482 = torch.constant.int 1
%int4096_483 = torch.constant.int 4096
%944 = torch.prim.ListConstruct %int1_482, %833, %int4096_483 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%945 = torch.aten.view %943, %944 : !torch.vtensor<[1,?,32,128],bf16>, !torch.list<int> -> !torch.vtensor<[1,?,4096],bf16>
torch.bind_symbolic_shape %945, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16>
%946 = torch.aten.div.Tensor %945, %26 : !torch.vtensor<[1,?,4096],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],bf16>
torch.bind_symbolic_shape %946, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16>
%float-2.400000e02_484 = torch.constant.float -2.400000e+02
%float2.400000e02_485 = torch.constant.float 2.400000e+02
%947 = torch.aten.clamp %946, %float-2.400000e02_484, %float2.400000e02_485 : !torch.vtensor<[1,?,4096],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],bf16>
torch.bind_symbolic_shape %947, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16>
%int26_486 = torch.constant.int 26
%948 = torch.prims.convert_element_type %947, %int26_486 : !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %948, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int-2_487 = torch.constant.int -2
%int-1_488 = torch.constant.int -1
%949 = torch.aten.transpose.int %27, %int-2_487, %int-1_488 : !torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%int4096_489 = torch.constant.int 4096
%950 = torch.prim.ListConstruct %833, %int4096_489 : (!torch.int, !torch.int) -> !torch.list<int>
%951 = torch.aten.view %948, %950 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %951, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%952 = torch.aten.mm %951, %949 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,4096],f8E4M3FNUZ> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %952, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%int1_490 = torch.constant.int 1
%int4096_491 = torch.constant.int 4096
%953 = torch.prim.ListConstruct %int1_490, %833, %int4096_491 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%954 = torch.aten.view %952, %953 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %954, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int15_492 = torch.constant.int 15
%955 = torch.prims.convert_element_type %954, %int15_492 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,4096],bf16>
torch.bind_symbolic_shape %955, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16>
%int1_493 = torch.constant.int 1
%956 = torch.aten.add.Tensor %777, %955, %int1_493 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %956, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
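// ffn_norm RMSNorm: x / sqrt(mean(x^2) + 1e-5), scaled by the norm weight %28; the normalized
// activations are then rescaled, clamped to +-240 and cast to f8 for the gate matmul.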
%int2_494 = torch.constant.int 2
%957 = torch.aten.pow.Tensor_Scalar %956, %int2_494 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %957, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int-1_495 = torch.constant.int -1
%958 = torch.prim.ListConstruct %int-1_495 : (!torch.int) -> !torch.list<int>
%true_496 = torch.constant.bool true
%none_497 = torch.constant.none
%959 = torch.aten.mean.dim %957, %958, %true_496, %none_497 : !torch.vtensor<[1,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,?,1],f32>
torch.bind_symbolic_shape %959, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32>
%float1.000000e-05_498 = torch.constant.float 1.000000e-05
%int1_499 = torch.constant.int 1
%960 = torch.aten.add.Scalar %959, %float1.000000e-05_498, %int1_499 : !torch.vtensor<[1,?,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,?,1],f32>
torch.bind_symbolic_shape %960, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32>
%961 = torch.aten.rsqrt %960 : !torch.vtensor<[1,?,1],f32> -> !torch.vtensor<[1,?,1],f32>
torch.bind_symbolic_shape %961, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32>
%962 = torch.aten.mul.Tensor %956, %961 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[1,?,1],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %962, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%963 = torch.aten.mul.Tensor %28, %962 : !torch.vtensor<[4096],bf16>, !torch.vtensor<[1,?,4096],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %963, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%964 = torch.aten.div.Tensor %963, %29 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %964, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%float-2.400000e02_500 = torch.constant.float -2.400000e+02
%float2.400000e02_501 = torch.constant.float 2.400000e+02
%965 = torch.aten.clamp %964, %float-2.400000e02_500, %float2.400000e02_501 : !torch.vtensor<[1,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %965, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int26_502 = torch.constant.int 26
%966 = torch.prims.convert_element_type %965, %int26_502 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %966, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
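// SwiGLU MLP in f8: gate = silu(x_norm @ W_gate^T) and up = x_norm @ W_up^T (both [seq, 14336],
// weights %30 and %32 with input scales %29 and %31); their product is requantized and multiplied
// by W_down^T (%34) back to [seq, 4096], widened to bf16, and added to the residual (%956).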
%int-2_503 = torch.constant.int -2
%int-1_504 = torch.constant.int -1
%967 = torch.aten.transpose.int %30, %int-2_503, %int-1_504 : !torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
%int4096_505 = torch.constant.int 4096
%968 = torch.prim.ListConstruct %566, %int4096_505 : (!torch.int, !torch.int) -> !torch.list<int>
%969 = torch.aten.view %966, %968 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %969, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%970 = torch.aten.mm %969, %967 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,14336],f8E4M3FNUZ> -> !torch.vtensor<[?,14336],f8E4M3FNUZ>
torch.bind_symbolic_shape %970, [%548], affine_map<()[s0] -> (s0 * 32, 14336)> : !torch.vtensor<[?,14336],f8E4M3FNUZ>
%int1_506 = torch.constant.int 1
%int14336_507 = torch.constant.int 14336
%971 = torch.prim.ListConstruct %int1_506, %566, %int14336_507 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%972 = torch.aten.view %970, %971 : !torch.vtensor<[?,14336],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,14336],f8E4M3FNUZ>
torch.bind_symbolic_shape %972, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>
%int15_508 = torch.constant.int 15
%973 = torch.prims.convert_element_type %972, %int15_508 : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,14336],bf16>
torch.bind_symbolic_shape %973, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16>
%974 = torch.aten.silu %973 : !torch.vtensor<[1,?,14336],bf16> -> !torch.vtensor<[1,?,14336],bf16>
torch.bind_symbolic_shape %974, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16>
%975 = torch.aten.div.Tensor %963, %31 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %975, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%float-2.400000e02_509 = torch.constant.float -2.400000e+02
%float2.400000e02_510 = torch.constant.float 2.400000e+02
%976 = torch.aten.clamp %975, %float-2.400000e02_509, %float2.400000e02_510 : !torch.vtensor<[1,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %976, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int26_511 = torch.constant.int 26
%977 = torch.prims.convert_element_type %976, %int26_511 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %977, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int-2_512 = torch.constant.int -2
%int-1_513 = torch.constant.int -1
%978 = torch.aten.transpose.int %32, %int-2_512, %int-1_513 : !torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
%int4096_514 = torch.constant.int 4096
%979 = torch.prim.ListConstruct %566, %int4096_514 : (!torch.int, !torch.int) -> !torch.list<int>
%980 = torch.aten.view %977, %979 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %980, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%981 = torch.aten.mm %980, %978 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,14336],f8E4M3FNUZ> -> !torch.vtensor<[?,14336],f8E4M3FNUZ>
torch.bind_symbolic_shape %981, [%548], affine_map<()[s0] -> (s0 * 32, 14336)> : !torch.vtensor<[?,14336],f8E4M3FNUZ>
%int1_515 = torch.constant.int 1
%int14336_516 = torch.constant.int 14336
%982 = torch.prim.ListConstruct %int1_515, %566, %int14336_516 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%983 = torch.aten.view %981, %982 : !torch.vtensor<[?,14336],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,14336],f8E4M3FNUZ>
torch.bind_symbolic_shape %983, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>
%int15_517 = torch.constant.int 15
%984 = torch.prims.convert_element_type %983, %int15_517 : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,14336],bf16>
torch.bind_symbolic_shape %984, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16>
%985 = torch.aten.mul.Tensor %974, %984 : !torch.vtensor<[1,?,14336],bf16>, !torch.vtensor<[1,?,14336],bf16> -> !torch.vtensor<[1,?,14336],bf16>
torch.bind_symbolic_shape %985, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16>
%986 = torch.aten.div.Tensor %985, %33 : !torch.vtensor<[1,?,14336],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,14336],bf16>
torch.bind_symbolic_shape %986, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16>
%float-2.400000e02_518 = torch.constant.float -2.400000e+02
%float2.400000e02_519 = torch.constant.float 2.400000e+02
%987 = torch.aten.clamp %986, %float-2.400000e02_518, %float2.400000e02_519 : !torch.vtensor<[1,?,14336],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,14336],bf16>
torch.bind_symbolic_shape %987, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16>
%int26_520 = torch.constant.int 26
%988 = torch.prims.convert_element_type %987, %int26_520 : !torch.vtensor<[1,?,14336],bf16>, !torch.int -> !torch.vtensor<[1,?,14336],f8E4M3FNUZ>
torch.bind_symbolic_shape %988, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>
%int-2_521 = torch.constant.int -2
%int-1_522 = torch.constant.int -1
%989 = torch.aten.transpose.int %34, %int-2_521, %int-1_522 : !torch.vtensor<[4096,14336],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%int1_523 = torch.constant.int 1
%990 = torch.aten.size.int %972, %int1_523 : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>, !torch.int -> !torch.int
%int14336_524 = torch.constant.int 14336
%991 = torch.prim.ListConstruct %990, %int14336_524 : (!torch.int, !torch.int) -> !torch.list<int>
%992 = torch.aten.view %988, %991 : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,14336],f8E4M3FNUZ>
torch.bind_symbolic_shape %992, [%548], affine_map<()[s0] -> (s0 * 32, 14336)> : !torch.vtensor<[?,14336],f8E4M3FNUZ>
%993 = torch.aten.mm %992, %989 : !torch.vtensor<[?,14336],f8E4M3FNUZ>, !torch.vtensor<[14336,4096],f8E4M3FNUZ> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %993, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%int1_525 = torch.constant.int 1
%int4096_526 = torch.constant.int 4096
%994 = torch.prim.ListConstruct %int1_525, %990, %int4096_526 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%995 = torch.aten.view %993, %994 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %995, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int15_527 = torch.constant.int 15
%996 = torch.prims.convert_element_type %995, %int15_527 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,4096],bf16>
torch.bind_symbolic_shape %996, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16>
%int1_528 = torch.constant.int 1
%997 = torch.aten.add.Tensor %956, %996, %int1_528 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %997, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
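// Next transformer block: attn_norm RMSNorm (weight %35), then the Q, K and V projections in f8
// (weights %37, %39, %41 with input scales %36, %38, %40); the bf16 outputs are reshaped into
// 32 query heads and 8 key/value heads of dimension 128.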
%int2_529 = torch.constant.int 2
%998 = torch.aten.pow.Tensor_Scalar %997, %int2_529 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %998, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int-1_530 = torch.constant.int -1
%999 = torch.prim.ListConstruct %int-1_530 : (!torch.int) -> !torch.list<int>
%true_531 = torch.constant.bool true
%none_532 = torch.constant.none
%1000 = torch.aten.mean.dim %998, %999, %true_531, %none_532 : !torch.vtensor<[1,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,?,1],f32>
torch.bind_symbolic_shape %1000, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32>
%float1.000000e-05_533 = torch.constant.float 1.000000e-05
%int1_534 = torch.constant.int 1
%1001 = torch.aten.add.Scalar %1000, %float1.000000e-05_533, %int1_534 : !torch.vtensor<[1,?,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,?,1],f32>
torch.bind_symbolic_shape %1001, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32>
%1002 = torch.aten.rsqrt %1001 : !torch.vtensor<[1,?,1],f32> -> !torch.vtensor<[1,?,1],f32>
torch.bind_symbolic_shape %1002, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32>
%1003 = torch.aten.mul.Tensor %997, %1002 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[1,?,1],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1003, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%1004 = torch.aten.mul.Tensor %35, %1003 : !torch.vtensor<[4096],bf16>, !torch.vtensor<[1,?,4096],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1004, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%1005 = torch.aten.div.Tensor %1004, %36 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1005, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%float-2.400000e02_535 = torch.constant.float -2.400000e+02
%float2.400000e02_536 = torch.constant.float 2.400000e+02
%1006 = torch.aten.clamp %1005, %float-2.400000e02_535, %float2.400000e02_536 : !torch.vtensor<[1,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1006, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int26_537 = torch.constant.int 26
%1007 = torch.prims.convert_element_type %1006, %int26_537 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1007, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int-2_538 = torch.constant.int -2
%int-1_539 = torch.constant.int -1
%1008 = torch.aten.transpose.int %37, %int-2_538, %int-1_539 : !torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%int4096_540 = torch.constant.int 4096
%1009 = torch.prim.ListConstruct %566, %int4096_540 : (!torch.int, !torch.int) -> !torch.list<int>
%1010 = torch.aten.view %1007, %1009 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1010, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%1011 = torch.aten.mm %1010, %1008 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,4096],f8E4M3FNUZ> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1011, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%int1_541 = torch.constant.int 1
%int4096_542 = torch.constant.int 4096
%1012 = torch.prim.ListConstruct %int1_541, %566, %int4096_542 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1013 = torch.aten.view %1011, %1012 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1013, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int15_543 = torch.constant.int 15
%1014 = torch.prims.convert_element_type %1013, %int15_543 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,4096],bf16>
torch.bind_symbolic_shape %1014, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16>
%1015 = torch.aten.div.Tensor %1004, %38 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1015, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%float-2.400000e02_544 = torch.constant.float -2.400000e+02
%float2.400000e02_545 = torch.constant.float 2.400000e+02
%1016 = torch.aten.clamp %1015, %float-2.400000e02_544, %float2.400000e02_545 : !torch.vtensor<[1,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1016, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int26_546 = torch.constant.int 26
%1017 = torch.prims.convert_element_type %1016, %int26_546 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1017, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int-2_547 = torch.constant.int -2
%int-1_548 = torch.constant.int -1
%1018 = torch.aten.transpose.int %39, %int-2_547, %int-1_548 : !torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,1024],f8E4M3FNUZ>
%int4096_549 = torch.constant.int 4096
%1019 = torch.prim.ListConstruct %566, %int4096_549 : (!torch.int, !torch.int) -> !torch.list<int>
%1020 = torch.aten.view %1017, %1019 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1020, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%1021 = torch.aten.mm %1020, %1018 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,1024],f8E4M3FNUZ> -> !torch.vtensor<[?,1024],f8E4M3FNUZ>
torch.bind_symbolic_shape %1021, [%548], affine_map<()[s0] -> (s0 * 32, 1024)> : !torch.vtensor<[?,1024],f8E4M3FNUZ>
%int1_550 = torch.constant.int 1
%int1024_551 = torch.constant.int 1024
%1022 = torch.prim.ListConstruct %int1_550, %566, %int1024_551 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1023 = torch.aten.view %1021, %1022 : !torch.vtensor<[?,1024],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,1024],f8E4M3FNUZ>
torch.bind_symbolic_shape %1023, [%548], affine_map<()[s0] -> (1, s0 * 32, 1024)> : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>
%int15_552 = torch.constant.int 15
%1024 = torch.prims.convert_element_type %1023, %int15_552 : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,1024],bf16>
torch.bind_symbolic_shape %1024, [%548], affine_map<()[s0] -> (1, s0 * 32, 1024)> : !torch.vtensor<[1,?,1024],bf16>
%1025 = torch.aten.div.Tensor %1004, %40 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1025, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%float-2.400000e02_553 = torch.constant.float -2.400000e+02
%float2.400000e02_554 = torch.constant.float 2.400000e+02
%1026 = torch.aten.clamp %1025, %float-2.400000e02_553, %float2.400000e02_554 : !torch.vtensor<[1,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1026, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int26_555 = torch.constant.int 26
%1027 = torch.prims.convert_element_type %1026, %int26_555 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1027, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int-2_556 = torch.constant.int -2
%int-1_557 = torch.constant.int -1
%1028 = torch.aten.transpose.int %41, %int-2_556, %int-1_557 : !torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,1024],f8E4M3FNUZ>
%int4096_558 = torch.constant.int 4096
%1029 = torch.prim.ListConstruct %566, %int4096_558 : (!torch.int, !torch.int) -> !torch.list<int>
%1030 = torch.aten.view %1027, %1029 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1030, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%1031 = torch.aten.mm %1030, %1028 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,1024],f8E4M3FNUZ> -> !torch.vtensor<[?,1024],f8E4M3FNUZ>
torch.bind_symbolic_shape %1031, [%548], affine_map<()[s0] -> (s0 * 32, 1024)> : !torch.vtensor<[?,1024],f8E4M3FNUZ>
%int1_559 = torch.constant.int 1
%int1024_560 = torch.constant.int 1024
%1032 = torch.prim.ListConstruct %int1_559, %566, %int1024_560 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1033 = torch.aten.view %1031, %1032 : !torch.vtensor<[?,1024],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,1024],f8E4M3FNUZ>
torch.bind_symbolic_shape %1033, [%548], affine_map<()[s0] -> (1, s0 * 32, 1024)> : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>
%int15_561 = torch.constant.int 15
%1034 = torch.prims.convert_element_type %1033, %int15_561 : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,1024],bf16>
torch.bind_symbolic_shape %1034, [%548], affine_map<()[s0] -> (1, s0 * 32, 1024)> : !torch.vtensor<[1,?,1024],bf16>
%int1_562 = torch.constant.int 1
%int32_563 = torch.constant.int 32
%int128_564 = torch.constant.int 128
%1035 = torch.prim.ListConstruct %int1_562, %566, %int32_563, %int128_564 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1036 = torch.aten.view %1014, %1035 : !torch.vtensor<[1,?,4096],bf16>, !torch.list<int> -> !torch.vtensor<[1,?,32,128],bf16>
torch.bind_symbolic_shape %1036, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16>
%int1_565 = torch.constant.int 1
%int8_566 = torch.constant.int 8
%int128_567 = torch.constant.int 128
%1037 = torch.prim.ListConstruct %int1_565, %566, %int8_566, %int128_567 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1038 = torch.aten.view %1024, %1037 : !torch.vtensor<[1,?,1024],bf16>, !torch.list<int> -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %1038, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
%int1_568 = torch.constant.int 1
%int8_569 = torch.constant.int 8
%int128_570 = torch.constant.int 128
%1039 = torch.prim.ListConstruct %int1_568, %566, %int8_569, %int128_570 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1040 = torch.aten.view %1034, %1039 : !torch.vtensor<[1,?,1024],bf16>, !torch.list<int> -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %1040, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
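// Rotary embedding for this block's queries: the inv_freq / angle table is rebuilt, sliced to the
// sequence length, and applied through sharktank_rotary_embedding_1_D_32_128_f32.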
%int131072_571 = torch.constant.int 131072
%none_572 = torch.constant.none
%none_573 = torch.constant.none
%cpu_574 = torch.constant.device "cpu"
%false_575 = torch.constant.bool false
%1041 = torch.aten.arange %int131072_571, %none_572, %none_573, %cpu_574, %false_575 : !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[131072],si64>
%int0_576 = torch.constant.int 0
%int128_577 = torch.constant.int 128
%none_578 = torch.constant.none
%none_579 = torch.constant.none
%cpu_580 = torch.constant.device "cpu"
%false_581 = torch.constant.bool false
%1042 = torch.aten.arange.start %int0_576, %int128_577, %none_578, %none_579, %cpu_580, %false_581 : !torch.int, !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[128],si64>
%int2_582 = torch.constant.int 2
%1043 = torch.aten.floor_divide.Scalar %1042, %int2_582 : !torch.vtensor<[128],si64>, !torch.int -> !torch.vtensor<[128],si64>
%int6_583 = torch.constant.int 6
%1044 = torch.prims.convert_element_type %1043, %int6_583 : !torch.vtensor<[128],si64>, !torch.int -> !torch.vtensor<[128],f32>
%int128_584 = torch.constant.int 128
%1045 = torch.aten.div.Scalar %1044, %int128_584 : !torch.vtensor<[128],f32>, !torch.int -> !torch.vtensor<[128],f32>
%float2.000000e00_585 = torch.constant.float 2.000000e+00
%1046 = torch.aten.mul.Scalar %1045, %float2.000000e00_585 : !torch.vtensor<[128],f32>, !torch.float -> !torch.vtensor<[128],f32>
%float5.000000e05_586 = torch.constant.float 5.000000e+05
%1047 = torch.aten.pow.Scalar %float5.000000e05_586, %1046 : !torch.float, !torch.vtensor<[128],f32> -> !torch.vtensor<[128],f32>
%1048 = torch.aten.reciprocal %1047 : !torch.vtensor<[128],f32> -> !torch.vtensor<[128],f32>
%float1.000000e00_587 = torch.constant.float 1.000000e+00
%1049 = torch.aten.mul.Scalar %1048, %float1.000000e00_587 : !torch.vtensor<[128],f32>, !torch.float -> !torch.vtensor<[128],f32>
%int131072_588 = torch.constant.int 131072
%int1_589 = torch.constant.int 1
%1050 = torch.prim.ListConstruct %int131072_588, %int1_589 : (!torch.int, !torch.int) -> !torch.list<int>
%1051 = torch.aten.view %1041, %1050 : !torch.vtensor<[131072],si64>, !torch.list<int> -> !torch.vtensor<[131072,1],si64>
%1052 = torch.aten.mul.Tensor %1051, %1049 : !torch.vtensor<[131072,1],si64>, !torch.vtensor<[128],f32> -> !torch.vtensor<[131072,128],f32>
%int1_590 = torch.constant.int 1
%1053 = torch.aten.size.int %1013, %int1_590 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.int -> !torch.int
%int0_591 = torch.constant.int 0
%1054 = torch.aten.add.int %int0_591, %1053 : !torch.int, !torch.int -> !torch.int
%int0_592 = torch.constant.int 0
%int0_593 = torch.constant.int 0
%int1_594 = torch.constant.int 1
%1055 = torch.aten.slice.Tensor %1052, %int0_592, %int0_593, %1054, %int1_594 : !torch.vtensor<[131072,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],f32>
torch.bind_symbolic_shape %1055, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],f32>
%int1_595 = torch.constant.int 1
%int0_596 = torch.constant.int 0
%int9223372036854775807_597 = torch.constant.int 9223372036854775807
%int1_598 = torch.constant.int 1
%1056 = torch.aten.slice.Tensor %1055, %int1_595, %int0_596, %int9223372036854775807_597, %int1_598 : !torch.vtensor<[?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],f32>
torch.bind_symbolic_shape %1056, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],f32>
%int1_599 = torch.constant.int 1
%int0_600 = torch.constant.int 0
%int9223372036854775807_601 = torch.constant.int 9223372036854775807
%int1_602 = torch.constant.int 1
%1057 = torch.aten.slice.Tensor %1056, %int1_599, %int0_600, %int9223372036854775807_601, %int1_602 : !torch.vtensor<[?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],f32>
torch.bind_symbolic_shape %1057, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],f32>
%int0_603 = torch.constant.int 0
%1058 = torch.aten.unsqueeze %1057, %int0_603 : !torch.vtensor<[?,128],f32>, !torch.int -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %1058, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int1_604 = torch.constant.int 1
%int0_605 = torch.constant.int 0
%int9223372036854775807_606 = torch.constant.int 9223372036854775807
%int1_607 = torch.constant.int 1
%1059 = torch.aten.slice.Tensor %1058, %int1_604, %int0_605, %int9223372036854775807_606, %int1_607 : !torch.vtensor<[1,?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %1059, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int2_608 = torch.constant.int 2
%int0_609 = torch.constant.int 0
%int9223372036854775807_610 = torch.constant.int 9223372036854775807
%int1_611 = torch.constant.int 1
%1060 = torch.aten.slice.Tensor %1059, %int2_608, %int0_609, %int9223372036854775807_610, %int1_611 : !torch.vtensor<[1,?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %1060, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int1_612 = torch.constant.int 1
%int1_613 = torch.constant.int 1
%int1_614 = torch.constant.int 1
%1061 = torch.prim.ListConstruct %int1_612, %int1_613, %int1_614 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1062 = torch.aten.repeat %1060, %1061 : !torch.vtensor<[1,?,128],f32>, !torch.list<int> -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %1062, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int6_615 = torch.constant.int 6
%1063 = torch.prims.convert_element_type %1036, %int6_615 : !torch.vtensor<[1,?,32,128],bf16>, !torch.int -> !torch.vtensor<[1,?,32,128],f32>
torch.bind_symbolic_shape %1063, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32>
%1064 = torch_c.to_builtin_tensor %1063 : !torch.vtensor<[1,?,32,128],f32> -> tensor<1x?x32x128xf32>
%1065 = torch_c.to_builtin_tensor %1062 : !torch.vtensor<[1,?,128],f32> -> tensor<1x?x128xf32>
%1066 = util.call @sharktank_rotary_embedding_1_D_32_128_f32(%1064, %1065) : (tensor<1x?x32x128xf32>, tensor<1x?x128xf32>) -> tensor<1x?x32x128xf32>
%1067 = torch_c.from_builtin_tensor %1066 : tensor<1x?x32x128xf32> -> !torch.vtensor<[1,?,32,128],f32>
torch.bind_symbolic_shape %1067, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32>
%int15_616 = torch.constant.int 15
%1068 = torch.prims.convert_element_type %1067, %int15_616 : !torch.vtensor<[1,?,32,128],f32>, !torch.int -> !torch.vtensor<[1,?,32,128],bf16>
torch.bind_symbolic_shape %1068, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16>
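    // The same angle table is recomputed below and applied to the 8 KV heads of the key via sharktank_rotary_embedding_1_D_8_128_f32, mirroring the query path above.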
%int131072_617 = torch.constant.int 131072
%none_618 = torch.constant.none
%none_619 = torch.constant.none
%cpu_620 = torch.constant.device "cpu"
%false_621 = torch.constant.bool false
%1069 = torch.aten.arange %int131072_617, %none_618, %none_619, %cpu_620, %false_621 : !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[131072],si64>
%int0_622 = torch.constant.int 0
%int128_623 = torch.constant.int 128
%none_624 = torch.constant.none
%none_625 = torch.constant.none
%cpu_626 = torch.constant.device "cpu"
%false_627 = torch.constant.bool false
%1070 = torch.aten.arange.start %int0_622, %int128_623, %none_624, %none_625, %cpu_626, %false_627 : !torch.int, !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[128],si64>
%int2_628 = torch.constant.int 2
%1071 = torch.aten.floor_divide.Scalar %1070, %int2_628 : !torch.vtensor<[128],si64>, !torch.int -> !torch.vtensor<[128],si64>
%int6_629 = torch.constant.int 6
%1072 = torch.prims.convert_element_type %1071, %int6_629 : !torch.vtensor<[128],si64>, !torch.int -> !torch.vtensor<[128],f32>
%int128_630 = torch.constant.int 128
%1073 = torch.aten.div.Scalar %1072, %int128_630 : !torch.vtensor<[128],f32>, !torch.int -> !torch.vtensor<[128],f32>
%float2.000000e00_631 = torch.constant.float 2.000000e+00
%1074 = torch.aten.mul.Scalar %1073, %float2.000000e00_631 : !torch.vtensor<[128],f32>, !torch.float -> !torch.vtensor<[128],f32>
%float5.000000e05_632 = torch.constant.float 5.000000e+05
%1075 = torch.aten.pow.Scalar %float5.000000e05_632, %1074 : !torch.float, !torch.vtensor<[128],f32> -> !torch.vtensor<[128],f32>
%1076 = torch.aten.reciprocal %1075 : !torch.vtensor<[128],f32> -> !torch.vtensor<[128],f32>
%float1.000000e00_633 = torch.constant.float 1.000000e+00
%1077 = torch.aten.mul.Scalar %1076, %float1.000000e00_633 : !torch.vtensor<[128],f32>, !torch.float -> !torch.vtensor<[128],f32>
%int131072_634 = torch.constant.int 131072
%int1_635 = torch.constant.int 1
%1078 = torch.prim.ListConstruct %int131072_634, %int1_635 : (!torch.int, !torch.int) -> !torch.list<int>
%1079 = torch.aten.view %1069, %1078 : !torch.vtensor<[131072],si64>, !torch.list<int> -> !torch.vtensor<[131072,1],si64>
%1080 = torch.aten.mul.Tensor %1079, %1077 : !torch.vtensor<[131072,1],si64>, !torch.vtensor<[128],f32> -> !torch.vtensor<[131072,128],f32>
%int1_636 = torch.constant.int 1
%1081 = torch.aten.size.int %1023, %int1_636 : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>, !torch.int -> !torch.int
%int0_637 = torch.constant.int 0
%1082 = torch.aten.add.int %int0_637, %1081 : !torch.int, !torch.int -> !torch.int
%int0_638 = torch.constant.int 0
%int0_639 = torch.constant.int 0
%int1_640 = torch.constant.int 1
%1083 = torch.aten.slice.Tensor %1080, %int0_638, %int0_639, %1082, %int1_640 : !torch.vtensor<[131072,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],f32>
torch.bind_symbolic_shape %1083, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],f32>
%int1_641 = torch.constant.int 1
%int0_642 = torch.constant.int 0
%int9223372036854775807_643 = torch.constant.int 9223372036854775807
%int1_644 = torch.constant.int 1
%1084 = torch.aten.slice.Tensor %1083, %int1_641, %int0_642, %int9223372036854775807_643, %int1_644 : !torch.vtensor<[?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],f32>
torch.bind_symbolic_shape %1084, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],f32>
%int1_645 = torch.constant.int 1
%int0_646 = torch.constant.int 0
%int9223372036854775807_647 = torch.constant.int 9223372036854775807
%int1_648 = torch.constant.int 1
%1085 = torch.aten.slice.Tensor %1084, %int1_645, %int0_646, %int9223372036854775807_647, %int1_648 : !torch.vtensor<[?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],f32>
torch.bind_symbolic_shape %1085, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],f32>
%int0_649 = torch.constant.int 0
%1086 = torch.aten.unsqueeze %1085, %int0_649 : !torch.vtensor<[?,128],f32>, !torch.int -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %1086, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int1_650 = torch.constant.int 1
%int0_651 = torch.constant.int 0
%int9223372036854775807_652 = torch.constant.int 9223372036854775807
%int1_653 = torch.constant.int 1
%1087 = torch.aten.slice.Tensor %1086, %int1_650, %int0_651, %int9223372036854775807_652, %int1_653 : !torch.vtensor<[1,?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %1087, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int2_654 = torch.constant.int 2
%int0_655 = torch.constant.int 0
%int9223372036854775807_656 = torch.constant.int 9223372036854775807
%int1_657 = torch.constant.int 1
%1088 = torch.aten.slice.Tensor %1087, %int2_654, %int0_655, %int9223372036854775807_656, %int1_657 : !torch.vtensor<[1,?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %1088, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int1_658 = torch.constant.int 1
%int1_659 = torch.constant.int 1
%int1_660 = torch.constant.int 1
%1089 = torch.prim.ListConstruct %int1_658, %int1_659, %int1_660 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1090 = torch.aten.repeat %1088, %1089 : !torch.vtensor<[1,?,128],f32>, !torch.list<int> -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %1090, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int6_661 = torch.constant.int 6
%1091 = torch.prims.convert_element_type %1038, %int6_661 : !torch.vtensor<[1,?,8,128],bf16>, !torch.int -> !torch.vtensor<[1,?,8,128],f32>
torch.bind_symbolic_shape %1091, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],f32>
%1092 = torch_c.to_builtin_tensor %1091 : !torch.vtensor<[1,?,8,128],f32> -> tensor<1x?x8x128xf32>
%1093 = torch_c.to_builtin_tensor %1090 : !torch.vtensor<[1,?,128],f32> -> tensor<1x?x128xf32>
%1094 = util.call @sharktank_rotary_embedding_1_D_8_128_f32(%1092, %1093) : (tensor<1x?x8x128xf32>, tensor<1x?x128xf32>) -> tensor<1x?x8x128xf32>
%1095 = torch_c.from_builtin_tensor %1094 : tensor<1x?x8x128xf32> -> !torch.vtensor<[1,?,8,128],f32>
torch.bind_symbolic_shape %1095, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],f32>
%int15_662 = torch.constant.int 15
%1096 = torch.prims.convert_element_type %1095, %int15_662 : !torch.vtensor<[1,?,8,128],f32>, !torch.int -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %1096, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
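    // The rotated key (and the value produced earlier) appear to be quantized for the KV cache below: divide by what looks like the kv_cache quantizer scale (%42), clamp to +/-240 (the finite range of f8E4M3FNUZ), then cast to f8E4M3FNUZ.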
%1097 = torch.aten.div.Tensor %1096, %42 : !torch.vtensor<[1,?,8,128],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %1097, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
%float-2.400000e02_663 = torch.constant.float -2.400000e+02
%float2.400000e02_664 = torch.constant.float 2.400000e+02
%1098 = torch.aten.clamp %1097, %float-2.400000e02_663, %float2.400000e02_664 : !torch.vtensor<[1,?,8,128],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %1098, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
%int26_665 = torch.constant.int 26
%1099 = torch.prims.convert_element_type %1098, %int26_665 : !torch.vtensor<[1,?,8,128],bf16>, !torch.int -> !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %1099, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>
%1100 = torch.aten.div.Tensor %1040, %42 : !torch.vtensor<[1,?,8,128],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %1100, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
%float-2.400000e02_666 = torch.constant.float -2.400000e+02
%float2.400000e02_667 = torch.constant.float 2.400000e+02
%1101 = torch.aten.clamp %1100, %float-2.400000e02_666, %float2.400000e02_667 : !torch.vtensor<[1,?,8,128],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %1101, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
%int26_668 = torch.constant.int 26
%1102 = torch.prims.convert_element_type %1101, %int26_668 : !torch.vtensor<[1,?,8,128],bf16>, !torch.int -> !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %1102, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>
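    // The section below appears to write K and V into the paged KV cache: %arg2 holds page ids, scaled by 64 (presumably 32 transformer blocks x {K,V}) plus this block's slot offset; the f16 cache is viewed as [pages, 32, 2, 32, 8, 128], flattened, updated with index_put, and viewed back to [?, 2097152].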
%int64_669 = torch.constant.int 64
%1103 = torch.aten.mul.Scalar %arg2, %int64_669 : !torch.vtensor<[1,?],si64>, !torch.int -> !torch.vtensor<[1,?],si64>
torch.bind_symbolic_shape %1103, [%548], affine_map<()[s0] -> (1, s0)> : !torch.vtensor<[1,?],si64>
%int4_670 = torch.constant.int 4
%int1_671 = torch.constant.int 1
%1104 = torch.aten.add.Scalar %1103, %int4_670, %int1_671 : !torch.vtensor<[1,?],si64>, !torch.int, !torch.int -> !torch.vtensor<[1,?],si64>
torch.bind_symbolic_shape %1104, [%548], affine_map<()[s0] -> (1, s0)> : !torch.vtensor<[1,?],si64>
%int1_672 = torch.constant.int 1
%int32_673 = torch.constant.int 32
%int8_674 = torch.constant.int 8
%int128_675 = torch.constant.int 128
%1105 = torch.prim.ListConstruct %int1_672, %670, %int32_673, %int8_674, %int128_675 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1106 = torch.aten.view %1099, %1105 : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %1106, [%548], affine_map<()[s0] -> (1, s0, 32, 8, 128)> : !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>
%int32_676 = torch.constant.int 32
%int8_677 = torch.constant.int 8
%int128_678 = torch.constant.int 128
%1107 = torch.prim.ListConstruct %670, %int32_676, %int8_677, %int128_678 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1108 = torch.aten.view %1106, %1107 : !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %1108, [%548], affine_map<()[s0] -> (s0, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
%1109 = torch.prim.ListConstruct %670 : (!torch.int) -> !torch.list<int>
%1110 = torch.aten.view %1104, %1109 : !torch.vtensor<[1,?],si64>, !torch.list<int> -> !torch.vtensor<[?],si64>
torch.bind_symbolic_shape %1110, [%548], affine_map<()[s0] -> (s0)> : !torch.vtensor<[?],si64>
%int32_679 = torch.constant.int 32
%int2_680 = torch.constant.int 2
%int32_681 = torch.constant.int 32
%int8_682 = torch.constant.int 8
%int128_683 = torch.constant.int 128
%1111 = torch.prim.ListConstruct %661, %int32_679, %int2_680, %int32_681, %int8_682, %int128_683 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1112 = torch.aten.view %919, %1111 : !torch.vtensor<[?,2097152],f16>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f16>
torch.bind_symbolic_shape %1112, [%549], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f16>
%int32_684 = torch.constant.int 32
%1113 = torch.aten.mul.int %661, %int32_684 : !torch.int, !torch.int -> !torch.int
%int2_685 = torch.constant.int 2
%1114 = torch.aten.mul.int %1113, %int2_685 : !torch.int, !torch.int -> !torch.int
%int32_686 = torch.constant.int 32
%int8_687 = torch.constant.int 8
%int128_688 = torch.constant.int 128
%1115 = torch.prim.ListConstruct %1114, %int32_686, %int8_687, %int128_688 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1116 = torch.aten.view %1112, %1115 : !torch.vtensor<[?,32,2,32,8,128],f16>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f16>
torch.bind_symbolic_shape %1116, [%549], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f16>
%1117 = torch.prim.ListConstruct %1110 : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>>
%false_689 = torch.constant.bool false
%1118 = torch.aten.index_put %1116, %1117, %1108, %false_689 : !torch.vtensor<[?,32,8,128],f16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.bool -> !torch.vtensor<[?,32,8,128],f16>
torch.bind_symbolic_shape %1118, [%549], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f16>
%int32_690 = torch.constant.int 32
%int2_691 = torch.constant.int 2
%int32_692 = torch.constant.int 32
%int8_693 = torch.constant.int 8
%int128_694 = torch.constant.int 128
%1119 = torch.prim.ListConstruct %661, %int32_690, %int2_691, %int32_692, %int8_693, %int128_694 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1120 = torch.aten.view %1118, %1119 : !torch.vtensor<[?,32,8,128],f16>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f16>
torch.bind_symbolic_shape %1120, [%549], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f16>
%int2097152_695 = torch.constant.int 2097152
%1121 = torch.prim.ListConstruct %661, %int2097152_695 : (!torch.int, !torch.int) -> !torch.list<int>
%1122 = torch.aten.view %1120, %1121 : !torch.vtensor<[?,32,2,32,8,128],f16>, !torch.list<int> -> !torch.vtensor<[?,2097152],f16>
torch.bind_symbolic_shape %1122, [%549], affine_map<()[s0] -> (s0, 2097152)> : !torch.vtensor<[?,2097152],f16>
%int32_696 = torch.constant.int 32
%int2_697 = torch.constant.int 2
%int32_698 = torch.constant.int 32
%int8_699 = torch.constant.int 8
%int128_700 = torch.constant.int 128
%1123 = torch.prim.ListConstruct %661, %int32_696, %int2_697, %int32_698, %int8_699, %int128_700 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1124 = torch.aten.view %1122, %1123 : !torch.vtensor<[?,2097152],f16>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f16>
torch.bind_symbolic_shape %1124, [%549], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f16>
%int32_701 = torch.constant.int 32
%int8_702 = torch.constant.int 8
%int128_703 = torch.constant.int 128
%1125 = torch.prim.ListConstruct %1114, %int32_701, %int8_702, %int128_703 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1126 = torch.aten.view %1124, %1125 : !torch.vtensor<[?,32,2,32,8,128],f16>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f16>
torch.bind_symbolic_shape %1126, [%549], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f16>
%int1_704 = torch.constant.int 1
%int32_705 = torch.constant.int 32
%int8_706 = torch.constant.int 8
%int128_707 = torch.constant.int 128
%1127 = torch.prim.ListConstruct %int1_704, %670, %int32_705, %int8_706, %int128_707 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1128 = torch.aten.view %1102, %1127 : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %1128, [%548], affine_map<()[s0] -> (1, s0, 32, 8, 128)> : !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>
%int32_708 = torch.constant.int 32
%int8_709 = torch.constant.int 8
%int128_710 = torch.constant.int 128
%1129 = torch.prim.ListConstruct %670, %int32_708, %int8_709, %int128_710 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1130 = torch.aten.view %1128, %1129 : !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %1130, [%548], affine_map<()[s0] -> (s0, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
%int1_711 = torch.constant.int 1
%int1_712 = torch.constant.int 1
%1131 = torch.aten.add.Scalar %1104, %int1_711, %int1_712 : !torch.vtensor<[1,?],si64>, !torch.int, !torch.int -> !torch.vtensor<[1,?],si64>
torch.bind_symbolic_shape %1131, [%548], affine_map<()[s0] -> (1, s0)> : !torch.vtensor<[1,?],si64>
%1132 = torch.prim.ListConstruct %670 : (!torch.int) -> !torch.list<int>
%1133 = torch.aten.view %1131, %1132 : !torch.vtensor<[1,?],si64>, !torch.list<int> -> !torch.vtensor<[?],si64>
torch.bind_symbolic_shape %1133, [%548], affine_map<()[s0] -> (s0)> : !torch.vtensor<[?],si64>
%1134 = torch.prim.ListConstruct %1133 : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>>
%false_713 = torch.constant.bool false
%1135 = torch.aten.index_put %1126, %1134, %1130, %false_713 : !torch.vtensor<[?,32,8,128],f16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.bool -> !torch.vtensor<[?,32,8,128],f16>
torch.bind_symbolic_shape %1135, [%549], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f16>
%int32_714 = torch.constant.int 32
%int2_715 = torch.constant.int 2
%int32_716 = torch.constant.int 32
%int8_717 = torch.constant.int 8
%int128_718 = torch.constant.int 128
%1136 = torch.prim.ListConstruct %661, %int32_714, %int2_715, %int32_716, %int8_717, %int128_718 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1137 = torch.aten.view %1135, %1136 : !torch.vtensor<[?,32,8,128],f16>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f16>
torch.bind_symbolic_shape %1137, [%549], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f16>
%int2097152_719 = torch.constant.int 2097152
%1138 = torch.prim.ListConstruct %661, %int2097152_719 : (!torch.int, !torch.int) -> !torch.list<int>
%1139 = torch.aten.view %1137, %1138 : !torch.vtensor<[?,32,2,32,8,128],f16>, !torch.list<int> -> !torch.vtensor<[?,2097152],f16>
torch.bind_symbolic_shape %1139, [%549], affine_map<()[s0] -> (s0, 2097152)> : !torch.vtensor<[?,2097152],f16>
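    // Grouped-query attention expansion below: the 8 KV heads are unsqueezed, expanded by a factor of 4, and reshaped to 32 heads to match the query, then dequantized (multiplied by the cache scale) and cast back to bf16 for attention.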
%int-2_720 = torch.constant.int -2
%1140 = torch.aten.unsqueeze %1099, %int-2_720 : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %1140, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 1, 128)> : !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>
%int1_721 = torch.constant.int 1
%int8_722 = torch.constant.int 8
%int4_723 = torch.constant.int 4
%int128_724 = torch.constant.int 128
%1141 = torch.prim.ListConstruct %int1_721, %1081, %int8_722, %int4_723, %int128_724 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%false_725 = torch.constant.bool false
%1142 = torch.aten.expand %1140, %1141, %false_725 : !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %1142, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 4, 128)> : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
%int0_726 = torch.constant.int 0
%1143 = torch.aten.clone %1142, %int0_726 : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %1143, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 4, 128)> : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
%int1_727 = torch.constant.int 1
%int32_728 = torch.constant.int 32
%int128_729 = torch.constant.int 128
%1144 = torch.prim.ListConstruct %int1_727, %1081, %int32_728, %int128_729 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1145 = torch.aten._unsafe_view %1143, %1144 : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %1145, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>
%int-2_730 = torch.constant.int -2
%1146 = torch.aten.unsqueeze %1102, %int-2_730 : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %1146, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 1, 128)> : !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>
%int1_731 = torch.constant.int 1
%1147 = torch.aten.size.int %1033, %int1_731 : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>, !torch.int -> !torch.int
%int1_732 = torch.constant.int 1
%int8_733 = torch.constant.int 8
%int4_734 = torch.constant.int 4
%int128_735 = torch.constant.int 128
%1148 = torch.prim.ListConstruct %int1_732, %1147, %int8_733, %int4_734, %int128_735 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%false_736 = torch.constant.bool false
%1149 = torch.aten.expand %1146, %1148, %false_736 : !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %1149, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 4, 128)> : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
%int0_737 = torch.constant.int 0
%1150 = torch.aten.clone %1149, %int0_737 : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %1150, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 4, 128)> : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
%int1_738 = torch.constant.int 1
%int32_739 = torch.constant.int 32
%int128_740 = torch.constant.int 128
%1151 = torch.prim.ListConstruct %int1_738, %1147, %int32_739, %int128_740 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1152 = torch.aten._unsafe_view %1150, %1151 : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %1152, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>
%int6_741 = torch.constant.int 6
%1153 = torch.prims.convert_element_type %1145, %int6_741 : !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,32,128],f32>
torch.bind_symbolic_shape %1153, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32>
%1154 = torch.aten.mul.Tensor %1153, %42 : !torch.vtensor<[1,?,32,128],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,32,128],f32>
torch.bind_symbolic_shape %1154, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32>
%int15_742 = torch.constant.int 15
%1155 = torch.prims.convert_element_type %1154, %int15_742 : !torch.vtensor<[1,?,32,128],f32>, !torch.int -> !torch.vtensor<[1,?,32,128],bf16>
torch.bind_symbolic_shape %1155, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16>
%int6_743 = torch.constant.int 6
%1156 = torch.prims.convert_element_type %1152, %int6_743 : !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,32,128],f32>
torch.bind_symbolic_shape %1156, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32>
%1157 = torch.aten.mul.Tensor %1156, %42 : !torch.vtensor<[1,?,32,128],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,32,128],f32>
torch.bind_symbolic_shape %1157, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32>
%int15_744 = torch.constant.int 15
%1158 = torch.prims.convert_element_type %1157, %int15_744 : !torch.vtensor<[1,?,32,128],f32>, !torch.int -> !torch.vtensor<[1,?,32,128],bf16>
torch.bind_symbolic_shape %1158, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16>
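    // Q, K, V are transposed to [1, 32, seq, 128] and passed to _scaled_dot_product_flash_attention_for_cpu with dropout 0.0 and is_causal = true; the attention output is transposed back afterwards.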
%int1_745 = torch.constant.int 1
%int2_746 = torch.constant.int 2
%1159 = torch.aten.transpose.int %1068, %int1_745, %int2_746 : !torch.vtensor<[1,?,32,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,32,?,128],bf16>
torch.bind_symbolic_shape %1159, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16>
%int1_747 = torch.constant.int 1
%int2_748 = torch.constant.int 2
%1160 = torch.aten.transpose.int %1155, %int1_747, %int2_748 : !torch.vtensor<[1,?,32,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,32,?,128],bf16>
torch.bind_symbolic_shape %1160, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16>
%int1_749 = torch.constant.int 1
%int2_750 = torch.constant.int 2
%1161 = torch.aten.transpose.int %1158, %int1_749, %int2_750 : !torch.vtensor<[1,?,32,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,32,?,128],bf16>
torch.bind_symbolic_shape %1161, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16>
%float0.000000e00_751 = torch.constant.float 0.000000e+00
%true_752 = torch.constant.bool true
%none_753 = torch.constant.none
%none_754 = torch.constant.none
%1162:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%1159, %1160, %1161, %float0.000000e00_751, %true_752, %none_753, %none_754) : (!torch.vtensor<[1,32,?,128],bf16>, !torch.vtensor<[1,32,?,128],bf16>, !torch.vtensor<[1,32,?,128],bf16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[1,32,?,128],bf16>, !torch.vtensor<[1,32,?],f32>)
torch.bind_symbolic_shape %1162#0, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16>
%int1_755 = torch.constant.int 1
%int2_756 = torch.constant.int 2
%1163 = torch.aten.transpose.int %1162#0, %int1_755, %int2_756 : !torch.vtensor<[1,32,?,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,?,32,128],bf16>
torch.bind_symbolic_shape %1163, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16>
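    // Attention output projection below: flatten to [1, ?, 4096], quantize (scale, clamp to +/-240, cast to f8E4M3FNUZ), matmul with the transposed attn_output weight, then cast back to bf16 and add to the residual stream.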
%int1_757 = torch.constant.int 1
%int4096_758 = torch.constant.int 4096
%1164 = torch.prim.ListConstruct %int1_757, %1053, %int4096_758 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1165 = torch.aten.view %1163, %1164 : !torch.vtensor<[1,?,32,128],bf16>, !torch.list<int> -> !torch.vtensor<[1,?,4096],bf16>
torch.bind_symbolic_shape %1165, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16>
%1166 = torch.aten.div.Tensor %1165, %43 : !torch.vtensor<[1,?,4096],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],bf16>
torch.bind_symbolic_shape %1166, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16>
%float-2.400000e02_759 = torch.constant.float -2.400000e+02
%float2.400000e02_760 = torch.constant.float 2.400000e+02
%1167 = torch.aten.clamp %1166, %float-2.400000e02_759, %float2.400000e02_760 : !torch.vtensor<[1,?,4096],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],bf16>
torch.bind_symbolic_shape %1167, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16>
%int26_761 = torch.constant.int 26
%1168 = torch.prims.convert_element_type %1167, %int26_761 : !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1168, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int-2_762 = torch.constant.int -2
%int-1_763 = torch.constant.int -1
%1169 = torch.aten.transpose.int %44, %int-2_762, %int-1_763 : !torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%int4096_764 = torch.constant.int 4096
%1170 = torch.prim.ListConstruct %1053, %int4096_764 : (!torch.int, !torch.int) -> !torch.list<int>
%1171 = torch.aten.view %1168, %1170 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1171, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%1172 = torch.aten.mm %1171, %1169 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,4096],f8E4M3FNUZ> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1172, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%int1_765 = torch.constant.int 1
%int4096_766 = torch.constant.int 4096
%1173 = torch.prim.ListConstruct %int1_765, %1053, %int4096_766 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1174 = torch.aten.view %1172, %1173 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1174, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int15_767 = torch.constant.int 15
%1175 = torch.prims.convert_element_type %1174, %int15_767 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,4096],bf16>
torch.bind_symbolic_shape %1175, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16>
%int1_768 = torch.constant.int 1
%1176 = torch.aten.add.Tensor %997, %1175, %int1_768 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1176, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
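    // RMSNorm (likely this block's ffn_norm) below: mean of squares over the hidden dimension, add eps 1e-5, rsqrt, scale by the norm weight, followed by quantization for the FFN matmuls.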
%int2_769 = torch.constant.int 2
%1177 = torch.aten.pow.Tensor_Scalar %1176, %int2_769 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1177, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int-1_770 = torch.constant.int -1
%1178 = torch.prim.ListConstruct %int-1_770 : (!torch.int) -> !torch.list<int>
%true_771 = torch.constant.bool true
%none_772 = torch.constant.none
%1179 = torch.aten.mean.dim %1177, %1178, %true_771, %none_772 : !torch.vtensor<[1,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,?,1],f32>
torch.bind_symbolic_shape %1179, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32>
%float1.000000e-05_773 = torch.constant.float 1.000000e-05
%int1_774 = torch.constant.int 1
%1180 = torch.aten.add.Scalar %1179, %float1.000000e-05_773, %int1_774 : !torch.vtensor<[1,?,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,?,1],f32>
torch.bind_symbolic_shape %1180, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32>
%1181 = torch.aten.rsqrt %1180 : !torch.vtensor<[1,?,1],f32> -> !torch.vtensor<[1,?,1],f32>
torch.bind_symbolic_shape %1181, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32>
%1182 = torch.aten.mul.Tensor %1176, %1181 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[1,?,1],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1182, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%1183 = torch.aten.mul.Tensor %45, %1182 : !torch.vtensor<[4096],bf16>, !torch.vtensor<[1,?,4096],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1183, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%1184 = torch.aten.div.Tensor %1183, %46 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1184, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%float-2.400000e02_775 = torch.constant.float -2.400000e+02
%float2.400000e02_776 = torch.constant.float 2.400000e+02
%1185 = torch.aten.clamp %1184, %float-2.400000e02_775, %float2.400000e02_776 : !torch.vtensor<[1,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1185, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int26_777 = torch.constant.int 26
%1186 = torch.prims.convert_element_type %1185, %int26_777 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1186, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
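    // SwiGLU FFN below: gate = silu(x @ W_gate^T) and up = x @ W_up^T are computed in f8E4M3FNUZ and multiplied in bf16; the product is re-quantized, projected down with W_down, and added back to the residual.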
%int-2_778 = torch.constant.int -2
%int-1_779 = torch.constant.int -1
%1187 = torch.aten.transpose.int %47, %int-2_778, %int-1_779 : !torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
%int4096_780 = torch.constant.int 4096
%1188 = torch.prim.ListConstruct %566, %int4096_780 : (!torch.int, !torch.int) -> !torch.list<int>
%1189 = torch.aten.view %1186, %1188 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1189, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%1190 = torch.aten.mm %1189, %1187 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,14336],f8E4M3FNUZ> -> !torch.vtensor<[?,14336],f8E4M3FNUZ>
torch.bind_symbolic_shape %1190, [%548], affine_map<()[s0] -> (s0 * 32, 14336)> : !torch.vtensor<[?,14336],f8E4M3FNUZ>
%int1_781 = torch.constant.int 1
%int14336_782 = torch.constant.int 14336
%1191 = torch.prim.ListConstruct %int1_781, %566, %int14336_782 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1192 = torch.aten.view %1190, %1191 : !torch.vtensor<[?,14336],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,14336],f8E4M3FNUZ>
torch.bind_symbolic_shape %1192, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>
%int15_783 = torch.constant.int 15
%1193 = torch.prims.convert_element_type %1192, %int15_783 : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,14336],bf16>
torch.bind_symbolic_shape %1193, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16>
%1194 = torch.aten.silu %1193 : !torch.vtensor<[1,?,14336],bf16> -> !torch.vtensor<[1,?,14336],bf16>
torch.bind_symbolic_shape %1194, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16>
%1195 = torch.aten.div.Tensor %1183, %48 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1195, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%float-2.400000e02_784 = torch.constant.float -2.400000e+02
%float2.400000e02_785 = torch.constant.float 2.400000e+02
%1196 = torch.aten.clamp %1195, %float-2.400000e02_784, %float2.400000e02_785 : !torch.vtensor<[1,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1196, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int26_786 = torch.constant.int 26
%1197 = torch.prims.convert_element_type %1196, %int26_786 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1197, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int-2_787 = torch.constant.int -2
%int-1_788 = torch.constant.int -1
%1198 = torch.aten.transpose.int %49, %int-2_787, %int-1_788 : !torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
%int4096_789 = torch.constant.int 4096
%1199 = torch.prim.ListConstruct %566, %int4096_789 : (!torch.int, !torch.int) -> !torch.list<int>
%1200 = torch.aten.view %1197, %1199 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1200, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%1201 = torch.aten.mm %1200, %1198 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,14336],f8E4M3FNUZ> -> !torch.vtensor<[?,14336],f8E4M3FNUZ>
torch.bind_symbolic_shape %1201, [%548], affine_map<()[s0] -> (s0 * 32, 14336)> : !torch.vtensor<[?,14336],f8E4M3FNUZ>
%int1_790 = torch.constant.int 1
%int14336_791 = torch.constant.int 14336
%1202 = torch.prim.ListConstruct %int1_790, %566, %int14336_791 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1203 = torch.aten.view %1201, %1202 : !torch.vtensor<[?,14336],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,14336],f8E4M3FNUZ>
torch.bind_symbolic_shape %1203, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>
%int15_792 = torch.constant.int 15
%1204 = torch.prims.convert_element_type %1203, %int15_792 : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,14336],bf16>
torch.bind_symbolic_shape %1204, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16>
%1205 = torch.aten.mul.Tensor %1194, %1204 : !torch.vtensor<[1,?,14336],bf16>, !torch.vtensor<[1,?,14336],bf16> -> !torch.vtensor<[1,?,14336],bf16>
torch.bind_symbolic_shape %1205, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16>
%1206 = torch.aten.div.Tensor %1205, %50 : !torch.vtensor<[1,?,14336],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,14336],bf16>
torch.bind_symbolic_shape %1206, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16>
%float-2.400000e02_793 = torch.constant.float -2.400000e+02
%float2.400000e02_794 = torch.constant.float 2.400000e+02
%1207 = torch.aten.clamp %1206, %float-2.400000e02_793, %float2.400000e02_794 : !torch.vtensor<[1,?,14336],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,14336],bf16>
torch.bind_symbolic_shape %1207, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16>
%int26_795 = torch.constant.int 26
%1208 = torch.prims.convert_element_type %1207, %int26_795 : !torch.vtensor<[1,?,14336],bf16>, !torch.int -> !torch.vtensor<[1,?,14336],f8E4M3FNUZ>
torch.bind_symbolic_shape %1208, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>
%int-2_796 = torch.constant.int -2
%int-1_797 = torch.constant.int -1
%1209 = torch.aten.transpose.int %51, %int-2_796, %int-1_797 : !torch.vtensor<[4096,14336],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%int1_798 = torch.constant.int 1
%1210 = torch.aten.size.int %1192, %int1_798 : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>, !torch.int -> !torch.int
%int14336_799 = torch.constant.int 14336
%1211 = torch.prim.ListConstruct %1210, %int14336_799 : (!torch.int, !torch.int) -> !torch.list<int>
%1212 = torch.aten.view %1208, %1211 : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,14336],f8E4M3FNUZ>
torch.bind_symbolic_shape %1212, [%548], affine_map<()[s0] -> (s0 * 32, 14336)> : !torch.vtensor<[?,14336],f8E4M3FNUZ>
%1213 = torch.aten.mm %1212, %1209 : !torch.vtensor<[?,14336],f8E4M3FNUZ>, !torch.vtensor<[14336,4096],f8E4M3FNUZ> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1213, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%int1_800 = torch.constant.int 1
%int4096_801 = torch.constant.int 4096
%1214 = torch.prim.ListConstruct %int1_800, %1210, %int4096_801 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1215 = torch.aten.view %1213, %1214 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1215, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int15_802 = torch.constant.int 15
%1216 = torch.prims.convert_element_type %1215, %int15_802 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,4096],bf16>
torch.bind_symbolic_shape %1216, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16>
%int1_803 = torch.constant.int 1
%1217 = torch.aten.add.Tensor %1176, %1216, %int1_803 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1217, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
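    // End of this transformer block; the next block appears to repeat the same pattern below, starting with its attn_norm RMSNorm and the Q/K/V projections.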
%int2_804 = torch.constant.int 2
%1218 = torch.aten.pow.Tensor_Scalar %1217, %int2_804 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1218, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int-1_805 = torch.constant.int -1
%1219 = torch.prim.ListConstruct %int-1_805 : (!torch.int) -> !torch.list<int>
%true_806 = torch.constant.bool true
%none_807 = torch.constant.none
%1220 = torch.aten.mean.dim %1218, %1219, %true_806, %none_807 : !torch.vtensor<[1,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,?,1],f32>
torch.bind_symbolic_shape %1220, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32>
%float1.000000e-05_808 = torch.constant.float 1.000000e-05
%int1_809 = torch.constant.int 1
%1221 = torch.aten.add.Scalar %1220, %float1.000000e-05_808, %int1_809 : !torch.vtensor<[1,?,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,?,1],f32>
torch.bind_symbolic_shape %1221, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32>
%1222 = torch.aten.rsqrt %1221 : !torch.vtensor<[1,?,1],f32> -> !torch.vtensor<[1,?,1],f32>
torch.bind_symbolic_shape %1222, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32>
%1223 = torch.aten.mul.Tensor %1217, %1222 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[1,?,1],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1223, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%1224 = torch.aten.mul.Tensor %52, %1223 : !torch.vtensor<[4096],bf16>, !torch.vtensor<[1,?,4096],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1224, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%1225 = torch.aten.div.Tensor %1224, %53 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1225, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%float-2.400000e02_810 = torch.constant.float -2.400000e+02
%float2.400000e02_811 = torch.constant.float 2.400000e+02
%1226 = torch.aten.clamp %1225, %float-2.400000e02_810, %float2.400000e02_811 : !torch.vtensor<[1,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1226, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int26_812 = torch.constant.int 26
%1227 = torch.prims.convert_element_type %1226, %int26_812 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1227, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int-2_813 = torch.constant.int -2
%int-1_814 = torch.constant.int -1
%1228 = torch.aten.transpose.int %54, %int-2_813, %int-1_814 : !torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%int4096_815 = torch.constant.int 4096
%1229 = torch.prim.ListConstruct %566, %int4096_815 : (!torch.int, !torch.int) -> !torch.list<int>
%1230 = torch.aten.view %1227, %1229 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1230, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%1231 = torch.aten.mm %1230, %1228 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,4096],f8E4M3FNUZ> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1231, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%int1_816 = torch.constant.int 1
%int4096_817 = torch.constant.int 4096
%1232 = torch.prim.ListConstruct %int1_816, %566, %int4096_817 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1233 = torch.aten.view %1231, %1232 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1233, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int15_818 = torch.constant.int 15
%1234 = torch.prims.convert_element_type %1233, %int15_818 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,4096],bf16>
torch.bind_symbolic_shape %1234, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16>
%1235 = torch.aten.div.Tensor %1224, %55 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1235, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%float-2.400000e02_819 = torch.constant.float -2.400000e+02
%float2.400000e02_820 = torch.constant.float 2.400000e+02
%1236 = torch.aten.clamp %1235, %float-2.400000e02_819, %float2.400000e02_820 : !torch.vtensor<[1,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1236, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int26_821 = torch.constant.int 26
%1237 = torch.prims.convert_element_type %1236, %int26_821 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1237, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int-2_822 = torch.constant.int -2
%int-1_823 = torch.constant.int -1
%1238 = torch.aten.transpose.int %56, %int-2_822, %int-1_823 : !torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,1024],f8E4M3FNUZ>
%int4096_824 = torch.constant.int 4096
%1239 = torch.prim.ListConstruct %566, %int4096_824 : (!torch.int, !torch.int) -> !torch.list<int>
%1240 = torch.aten.view %1237, %1239 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1240, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%1241 = torch.aten.mm %1240, %1238 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,1024],f8E4M3FNUZ> -> !torch.vtensor<[?,1024],f8E4M3FNUZ>
torch.bind_symbolic_shape %1241, [%548], affine_map<()[s0] -> (s0 * 32, 1024)> : !torch.vtensor<[?,1024],f8E4M3FNUZ>
%int1_825 = torch.constant.int 1
%int1024_826 = torch.constant.int 1024
%1242 = torch.prim.ListConstruct %int1_825, %566, %int1024_826 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1243 = torch.aten.view %1241, %1242 : !torch.vtensor<[?,1024],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,1024],f8E4M3FNUZ>
torch.bind_symbolic_shape %1243, [%548], affine_map<()[s0] -> (1, s0 * 32, 1024)> : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>
%int15_827 = torch.constant.int 15
%1244 = torch.prims.convert_element_type %1243, %int15_827 : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,1024],bf16>
torch.bind_symbolic_shape %1244, [%548], affine_map<()[s0] -> (1, s0 * 32, 1024)> : !torch.vtensor<[1,?,1024],bf16>
%1245 = torch.aten.div.Tensor %1224, %57 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1245, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%float-2.400000e02_828 = torch.constant.float -2.400000e+02
%float2.400000e02_829 = torch.constant.float 2.400000e+02
%1246 = torch.aten.clamp %1245, %float-2.400000e02_828, %float2.400000e02_829 : !torch.vtensor<[1,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1246, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int26_830 = torch.constant.int 26
%1247 = torch.prims.convert_element_type %1246, %int26_830 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1247, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int-2_831 = torch.constant.int -2
%int-1_832 = torch.constant.int -1
%1248 = torch.aten.transpose.int %58, %int-2_831, %int-1_832 : !torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,1024],f8E4M3FNUZ>
%int4096_833 = torch.constant.int 4096
%1249 = torch.prim.ListConstruct %566, %int4096_833 : (!torch.int, !torch.int) -> !torch.list<int>
%1250 = torch.aten.view %1247, %1249 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1250, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%1251 = torch.aten.mm %1250, %1248 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,1024],f8E4M3FNUZ> -> !torch.vtensor<[?,1024],f8E4M3FNUZ>
torch.bind_symbolic_shape %1251, [%548], affine_map<()[s0] -> (s0 * 32, 1024)> : !torch.vtensor<[?,1024],f8E4M3FNUZ>
%int1_834 = torch.constant.int 1
%int1024_835 = torch.constant.int 1024
%1252 = torch.prim.ListConstruct %int1_834, %566, %int1024_835 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1253 = torch.aten.view %1251, %1252 : !torch.vtensor<[?,1024],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,1024],f8E4M3FNUZ>
torch.bind_symbolic_shape %1253, [%548], affine_map<()[s0] -> (1, s0 * 32, 1024)> : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>
%int15_836 = torch.constant.int 15
%1254 = torch.prims.convert_element_type %1253, %int15_836 : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,1024],bf16>
torch.bind_symbolic_shape %1254, [%548], affine_map<()[s0] -> (1, s0 * 32, 1024)> : !torch.vtensor<[1,?,1024],bf16>
%int1_837 = torch.constant.int 1
%int32_838 = torch.constant.int 32
%int128_839 = torch.constant.int 128
%1255 = torch.prim.ListConstruct %int1_837, %566, %int32_838, %int128_839 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1256 = torch.aten.view %1234, %1255 : !torch.vtensor<[1,?,4096],bf16>, !torch.list<int> -> !torch.vtensor<[1,?,32,128],bf16>
torch.bind_symbolic_shape %1256, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16>
%int1_840 = torch.constant.int 1
%int8_841 = torch.constant.int 8
%int128_842 = torch.constant.int 128
%1257 = torch.prim.ListConstruct %int1_840, %566, %int8_841, %int128_842 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1258 = torch.aten.view %1244, %1257 : !torch.vtensor<[1,?,1024],bf16>, !torch.list<int> -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %1258, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
%int1_843 = torch.constant.int 1
%int8_844 = torch.constant.int 8
%int128_845 = torch.constant.int 128
%1259 = torch.prim.ListConstruct %int1_843, %566, %int8_844, %int128_845 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1260 = torch.aten.view %1254, %1259 : !torch.vtensor<[1,?,1024],bf16>, !torch.list<int> -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %1260, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
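    // The RoPE angle table is rebuilt from the same constants (131072 positions, theta 5.0e5) for the next block's query/key rotation below.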
%int131072_846 = torch.constant.int 131072
%none_847 = torch.constant.none
%none_848 = torch.constant.none
%cpu_849 = torch.constant.device "cpu"
%false_850 = torch.constant.bool false
%1261 = torch.aten.arange %int131072_846, %none_847, %none_848, %cpu_849, %false_850 : !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[131072],si64>
%int0_851 = torch.constant.int 0
%int128_852 = torch.constant.int 128
%none_853 = torch.constant.none
%none_854 = torch.constant.none
%cpu_855 = torch.constant.device "cpu"
%false_856 = torch.constant.bool false
%1262 = torch.aten.arange.start %int0_851, %int128_852, %none_853, %none_854, %cpu_855, %false_856 : !torch.int, !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[128],si64>
%int2_857 = torch.constant.int 2
%1263 = torch.aten.floor_divide.Scalar %1262, %int2_857 : !torch.vtensor<[128],si64>, !torch.int -> !torch.vtensor<[128],si64>
%int6_858 = torch.constant.int 6
%1264 = torch.prims.convert_element_type %1263, %int6_858 : !torch.vtensor<[128],si64>, !torch.int -> !torch.vtensor<[128],f32>
%int128_859 = torch.constant.int 128
%1265 = torch.aten.div.Scalar %1264, %int128_859 : !torch.vtensor<[128],f32>, !torch.int -> !torch.vtensor<[128],f32>
%float2.000000e00_860 = torch.constant.float 2.000000e+00
%1266 = torch.aten.mul.Scalar %1265, %float2.000000e00_860 : !torch.vtensor<[128],f32>, !torch.float -> !torch.vtensor<[128],f32>
%float5.000000e05_861 = torch.constant.float 5.000000e+05
%1267 = torch.aten.pow.Scalar %float5.000000e05_861, %1266 : !torch.float, !torch.vtensor<[128],f32> -> !torch.vtensor<[128],f32>
%1268 = torch.aten.reciprocal %1267 : !torch.vtensor<[128],f32> -> !torch.vtensor<[128],f32>
%float1.000000e00_862 = torch.constant.float 1.000000e+00
%1269 = torch.aten.mul.Scalar %1268, %float1.000000e00_862 : !torch.vtensor<[128],f32>, !torch.float -> !torch.vtensor<[128],f32>
%int131072_863 = torch.constant.int 131072
%int1_864 = torch.constant.int 1
%1270 = torch.prim.ListConstruct %int131072_863, %int1_864 : (!torch.int, !torch.int) -> !torch.list<int>
%1271 = torch.aten.view %1261, %1270 : !torch.vtensor<[131072],si64>, !torch.list<int> -> !torch.vtensor<[131072,1],si64>
%1272 = torch.aten.mul.Tensor %1271, %1269 : !torch.vtensor<[131072,1],si64>, !torch.vtensor<[128],f32> -> !torch.vtensor<[131072,128],f32>
%int1_865 = torch.constant.int 1
%1273 = torch.aten.size.int %1233, %int1_865 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.int -> !torch.int
%int0_866 = torch.constant.int 0
%1274 = torch.aten.add.int %int0_866, %1273 : !torch.int, !torch.int -> !torch.int
%int0_867 = torch.constant.int 0
%int0_868 = torch.constant.int 0
%int1_869 = torch.constant.int 1
%1275 = torch.aten.slice.Tensor %1272, %int0_867, %int0_868, %1274, %int1_869 : !torch.vtensor<[131072,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],f32>
torch.bind_symbolic_shape %1275, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],f32>
%int1_870 = torch.constant.int 1
%int0_871 = torch.constant.int 0
%int9223372036854775807_872 = torch.constant.int 9223372036854775807
%int1_873 = torch.constant.int 1
%1276 = torch.aten.slice.Tensor %1275, %int1_870, %int0_871, %int9223372036854775807_872, %int1_873 : !torch.vtensor<[?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],f32>
torch.bind_symbolic_shape %1276, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],f32>
%int1_874 = torch.constant.int 1
%int0_875 = torch.constant.int 0
%int9223372036854775807_876 = torch.constant.int 9223372036854775807
%int1_877 = torch.constant.int 1
%1277 = torch.aten.slice.Tensor %1276, %int1_874, %int0_875, %int9223372036854775807_876, %int1_877 : !torch.vtensor<[?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],f32>
torch.bind_symbolic_shape %1277, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],f32>
%int0_878 = torch.constant.int 0
%1278 = torch.aten.unsqueeze %1277, %int0_878 : !torch.vtensor<[?,128],f32>, !torch.int -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %1278, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int1_879 = torch.constant.int 1
%int0_880 = torch.constant.int 0
%int9223372036854775807_881 = torch.constant.int 9223372036854775807
%int1_882 = torch.constant.int 1
%1279 = torch.aten.slice.Tensor %1278, %int1_879, %int0_880, %int9223372036854775807_881, %int1_882 : !torch.vtensor<[1,?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %1279, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int2_883 = torch.constant.int 2
%int0_884 = torch.constant.int 0
%int9223372036854775807_885 = torch.constant.int 9223372036854775807
%int1_886 = torch.constant.int 1
%1280 = torch.aten.slice.Tensor %1279, %int2_883, %int0_884, %int9223372036854775807_885, %int1_886 : !torch.vtensor<[1,?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %1280, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int1_887 = torch.constant.int 1
%int1_888 = torch.constant.int 1
%int1_889 = torch.constant.int 1
%1281 = torch.prim.ListConstruct %int1_887, %int1_888, %int1_889 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1282 = torch.aten.repeat %1280, %1281 : !torch.vtensor<[1,?,128],f32>, !torch.list<int> -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %1282, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
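// Apply rotary position embedding to the 32 query heads: upcast bf16 -> f32, call the sharktank rotary kernel, downcast back to bf16.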
%int6_890 = torch.constant.int 6
%1283 = torch.prims.convert_element_type %1256, %int6_890 : !torch.vtensor<[1,?,32,128],bf16>, !torch.int -> !torch.vtensor<[1,?,32,128],f32>
torch.bind_symbolic_shape %1283, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32>
%1284 = torch_c.to_builtin_tensor %1283 : !torch.vtensor<[1,?,32,128],f32> -> tensor<1x?x32x128xf32>
%1285 = torch_c.to_builtin_tensor %1282 : !torch.vtensor<[1,?,128],f32> -> tensor<1x?x128xf32>
%1286 = util.call @sharktank_rotary_embedding_1_D_32_128_f32(%1284, %1285) : (tensor<1x?x32x128xf32>, tensor<1x?x128xf32>) -> tensor<1x?x32x128xf32>
%1287 = torch_c.from_builtin_tensor %1286 : tensor<1x?x32x128xf32> -> !torch.vtensor<[1,?,32,128],f32>
torch.bind_symbolic_shape %1287, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32>
%int15_891 = torch.constant.int 15
%1288 = torch.prims.convert_element_type %1287, %int15_891 : !torch.vtensor<[1,?,32,128],f32>, !torch.int -> !torch.vtensor<[1,?,32,128],bf16>
torch.bind_symbolic_shape %1288, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16>
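// Rebuild the same angle table and slice/broadcast it for the key sequence length.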
%int131072_892 = torch.constant.int 131072
%none_893 = torch.constant.none
%none_894 = torch.constant.none
%cpu_895 = torch.constant.device "cpu"
%false_896 = torch.constant.bool false
%1289 = torch.aten.arange %int131072_892, %none_893, %none_894, %cpu_895, %false_896 : !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[131072],si64>
%int0_897 = torch.constant.int 0
%int128_898 = torch.constant.int 128
%none_899 = torch.constant.none
%none_900 = torch.constant.none
%cpu_901 = torch.constant.device "cpu"
%false_902 = torch.constant.bool false
%1290 = torch.aten.arange.start %int0_897, %int128_898, %none_899, %none_900, %cpu_901, %false_902 : !torch.int, !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[128],si64>
%int2_903 = torch.constant.int 2
%1291 = torch.aten.floor_divide.Scalar %1290, %int2_903 : !torch.vtensor<[128],si64>, !torch.int -> !torch.vtensor<[128],si64>
%int6_904 = torch.constant.int 6
%1292 = torch.prims.convert_element_type %1291, %int6_904 : !torch.vtensor<[128],si64>, !torch.int -> !torch.vtensor<[128],f32>
%int128_905 = torch.constant.int 128
%1293 = torch.aten.div.Scalar %1292, %int128_905 : !torch.vtensor<[128],f32>, !torch.int -> !torch.vtensor<[128],f32>
%float2.000000e00_906 = torch.constant.float 2.000000e+00
%1294 = torch.aten.mul.Scalar %1293, %float2.000000e00_906 : !torch.vtensor<[128],f32>, !torch.float -> !torch.vtensor<[128],f32>
%float5.000000e05_907 = torch.constant.float 5.000000e+05
%1295 = torch.aten.pow.Scalar %float5.000000e05_907, %1294 : !torch.float, !torch.vtensor<[128],f32> -> !torch.vtensor<[128],f32>
%1296 = torch.aten.reciprocal %1295 : !torch.vtensor<[128],f32> -> !torch.vtensor<[128],f32>
%float1.000000e00_908 = torch.constant.float 1.000000e+00
%1297 = torch.aten.mul.Scalar %1296, %float1.000000e00_908 : !torch.vtensor<[128],f32>, !torch.float -> !torch.vtensor<[128],f32>
%int131072_909 = torch.constant.int 131072
%int1_910 = torch.constant.int 1
%1298 = torch.prim.ListConstruct %int131072_909, %int1_910 : (!torch.int, !torch.int) -> !torch.list<int>
%1299 = torch.aten.view %1289, %1298 : !torch.vtensor<[131072],si64>, !torch.list<int> -> !torch.vtensor<[131072,1],si64>
%1300 = torch.aten.mul.Tensor %1299, %1297 : !torch.vtensor<[131072,1],si64>, !torch.vtensor<[128],f32> -> !torch.vtensor<[131072,128],f32>
%int1_911 = torch.constant.int 1
%1301 = torch.aten.size.int %1243, %int1_911 : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>, !torch.int -> !torch.int
%int0_912 = torch.constant.int 0
%1302 = torch.aten.add.int %int0_912, %1301 : !torch.int, !torch.int -> !torch.int
%int0_913 = torch.constant.int 0
%int0_914 = torch.constant.int 0
%int1_915 = torch.constant.int 1
%1303 = torch.aten.slice.Tensor %1300, %int0_913, %int0_914, %1302, %int1_915 : !torch.vtensor<[131072,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],f32>
torch.bind_symbolic_shape %1303, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],f32>
%int1_916 = torch.constant.int 1
%int0_917 = torch.constant.int 0
%int9223372036854775807_918 = torch.constant.int 9223372036854775807
%int1_919 = torch.constant.int 1
%1304 = torch.aten.slice.Tensor %1303, %int1_916, %int0_917, %int9223372036854775807_918, %int1_919 : !torch.vtensor<[?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],f32>
torch.bind_symbolic_shape %1304, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],f32>
%int1_920 = torch.constant.int 1
%int0_921 = torch.constant.int 0
%int9223372036854775807_922 = torch.constant.int 9223372036854775807
%int1_923 = torch.constant.int 1
%1305 = torch.aten.slice.Tensor %1304, %int1_920, %int0_921, %int9223372036854775807_922, %int1_923 : !torch.vtensor<[?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],f32>
torch.bind_symbolic_shape %1305, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],f32>
%int0_924 = torch.constant.int 0
%1306 = torch.aten.unsqueeze %1305, %int0_924 : !torch.vtensor<[?,128],f32>, !torch.int -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %1306, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int1_925 = torch.constant.int 1
%int0_926 = torch.constant.int 0
%int9223372036854775807_927 = torch.constant.int 9223372036854775807
%int1_928 = torch.constant.int 1
%1307 = torch.aten.slice.Tensor %1306, %int1_925, %int0_926, %int9223372036854775807_927, %int1_928 : !torch.vtensor<[1,?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %1307, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int2_929 = torch.constant.int 2
%int0_930 = torch.constant.int 0
%int9223372036854775807_931 = torch.constant.int 9223372036854775807
%int1_932 = torch.constant.int 1
%1308 = torch.aten.slice.Tensor %1307, %int2_929, %int0_930, %int9223372036854775807_931, %int1_932 : !torch.vtensor<[1,?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %1308, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int1_933 = torch.constant.int 1
%int1_934 = torch.constant.int 1
%int1_935 = torch.constant.int 1
%1309 = torch.prim.ListConstruct %int1_933, %int1_934, %int1_935 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1310 = torch.aten.repeat %1308, %1309 : !torch.vtensor<[1,?,128],f32>, !torch.list<int> -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %1310, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
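// Apply rotary position embedding to the 8 key heads.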
%int6_936 = torch.constant.int 6
%1311 = torch.prims.convert_element_type %1258, %int6_936 : !torch.vtensor<[1,?,8,128],bf16>, !torch.int -> !torch.vtensor<[1,?,8,128],f32>
torch.bind_symbolic_shape %1311, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],f32>
%1312 = torch_c.to_builtin_tensor %1311 : !torch.vtensor<[1,?,8,128],f32> -> tensor<1x?x8x128xf32>
%1313 = torch_c.to_builtin_tensor %1310 : !torch.vtensor<[1,?,128],f32> -> tensor<1x?x128xf32>
%1314 = util.call @sharktank_rotary_embedding_1_D_8_128_f32(%1312, %1313) : (tensor<1x?x8x128xf32>, tensor<1x?x128xf32>) -> tensor<1x?x8x128xf32>
%1315 = torch_c.from_builtin_tensor %1314 : tensor<1x?x8x128xf32> -> !torch.vtensor<[1,?,8,128],f32>
torch.bind_symbolic_shape %1315, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],f32>
%int15_937 = torch.constant.int 15
%1316 = torch.prims.convert_element_type %1315, %int15_937 : !torch.vtensor<[1,?,8,128],f32>, !torch.int -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %1316, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
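// Quantize the rotated keys and the values: divide by the shared scale %59, clamp to the f8E4M3FNUZ range [-240, 240], cast to f8.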
%1317 = torch.aten.div.Tensor %1316, %59 : !torch.vtensor<[1,?,8,128],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %1317, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
%float-2.400000e02_938 = torch.constant.float -2.400000e+02
%float2.400000e02_939 = torch.constant.float 2.400000e+02
%1318 = torch.aten.clamp %1317, %float-2.400000e02_938, %float2.400000e02_939 : !torch.vtensor<[1,?,8,128],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %1318, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
%int26_940 = torch.constant.int 26
%1319 = torch.prims.convert_element_type %1318, %int26_940 : !torch.vtensor<[1,?,8,128],bf16>, !torch.int -> !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %1319, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>
%1320 = torch.aten.div.Tensor %1260, %59 : !torch.vtensor<[1,?,8,128],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %1320, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
%float-2.400000e02_941 = torch.constant.float -2.400000e+02
%float2.400000e02_942 = torch.constant.float 2.400000e+02
%1321 = torch.aten.clamp %1320, %float-2.400000e02_941, %float2.400000e02_942 : !torch.vtensor<[1,?,8,128],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %1321, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
%int26_943 = torch.constant.int 26
%1322 = torch.prims.convert_element_type %1321, %int26_943 : !torch.vtensor<[1,?,8,128],bf16>, !torch.int -> !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %1322, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>
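// Scatter the quantized keys into the cache (this appears to be the paged KV-cache update): row indices are %arg2 * 64 + 6; the flat [?, 2097152] buffer is viewed as [?, 32, 2, 32, 8, 128] and collapsed to [? * 64, 32, 8, 128] before index_put.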
%int64_944 = torch.constant.int 64
%1323 = torch.aten.mul.Scalar %arg2, %int64_944 : !torch.vtensor<[1,?],si64>, !torch.int -> !torch.vtensor<[1,?],si64>
torch.bind_symbolic_shape %1323, [%548], affine_map<()[s0] -> (1, s0)> : !torch.vtensor<[1,?],si64>
%int6_945 = torch.constant.int 6
%int1_946 = torch.constant.int 1
%1324 = torch.aten.add.Scalar %1323, %int6_945, %int1_946 : !torch.vtensor<[1,?],si64>, !torch.int, !torch.int -> !torch.vtensor<[1,?],si64>
torch.bind_symbolic_shape %1324, [%548], affine_map<()[s0] -> (1, s0)> : !torch.vtensor<[1,?],si64>
%int1_947 = torch.constant.int 1
%int32_948 = torch.constant.int 32
%int8_949 = torch.constant.int 8
%int128_950 = torch.constant.int 128
%1325 = torch.prim.ListConstruct %int1_947, %670, %int32_948, %int8_949, %int128_950 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1326 = torch.aten.view %1319, %1325 : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %1326, [%548], affine_map<()[s0] -> (1, s0, 32, 8, 128)> : !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>
%int32_951 = torch.constant.int 32
%int8_952 = torch.constant.int 8
%int128_953 = torch.constant.int 128
%1327 = torch.prim.ListConstruct %670, %int32_951, %int8_952, %int128_953 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1328 = torch.aten.view %1326, %1327 : !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %1328, [%548], affine_map<()[s0] -> (s0, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
%1329 = torch.prim.ListConstruct %670 : (!torch.int) -> !torch.list<int>
%1330 = torch.aten.view %1324, %1329 : !torch.vtensor<[1,?],si64>, !torch.list<int> -> !torch.vtensor<[?],si64>
torch.bind_symbolic_shape %1330, [%548], affine_map<()[s0] -> (s0)> : !torch.vtensor<[?],si64>
%int32_954 = torch.constant.int 32
%int2_955 = torch.constant.int 2
%int32_956 = torch.constant.int 32
%int8_957 = torch.constant.int 8
%int128_958 = torch.constant.int 128
%1331 = torch.prim.ListConstruct %661, %int32_954, %int2_955, %int32_956, %int8_957, %int128_958 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1332 = torch.aten.view %1139, %1331 : !torch.vtensor<[?,2097152],f16>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f16>
torch.bind_symbolic_shape %1332, [%549], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f16>
%int32_959 = torch.constant.int 32
%1333 = torch.aten.mul.int %661, %int32_959 : !torch.int, !torch.int -> !torch.int
%int2_960 = torch.constant.int 2
%1334 = torch.aten.mul.int %1333, %int2_960 : !torch.int, !torch.int -> !torch.int
%int32_961 = torch.constant.int 32
%int8_962 = torch.constant.int 8
%int128_963 = torch.constant.int 128
%1335 = torch.prim.ListConstruct %1334, %int32_961, %int8_962, %int128_963 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1336 = torch.aten.view %1332, %1335 : !torch.vtensor<[?,32,2,32,8,128],f16>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f16>
torch.bind_symbolic_shape %1336, [%549], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f16>
%1337 = torch.prim.ListConstruct %1330 : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>>
%false_964 = torch.constant.bool false
%1338 = torch.aten.index_put %1336, %1337, %1328, %false_964 : !torch.vtensor<[?,32,8,128],f16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.bool -> !torch.vtensor<[?,32,8,128],f16>
torch.bind_symbolic_shape %1338, [%549], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f16>
%int32_965 = torch.constant.int 32
%int2_966 = torch.constant.int 2
%int32_967 = torch.constant.int 32
%int8_968 = torch.constant.int 8
%int128_969 = torch.constant.int 128
%1339 = torch.prim.ListConstruct %661, %int32_965, %int2_966, %int32_967, %int8_968, %int128_969 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1340 = torch.aten.view %1338, %1339 : !torch.vtensor<[?,32,8,128],f16>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f16>
torch.bind_symbolic_shape %1340, [%549], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f16>
%int2097152_970 = torch.constant.int 2097152
%1341 = torch.prim.ListConstruct %661, %int2097152_970 : (!torch.int, !torch.int) -> !torch.list<int>
%1342 = torch.aten.view %1340, %1341 : !torch.vtensor<[?,32,2,32,8,128],f16>, !torch.list<int> -> !torch.vtensor<[?,2097152],f16>
torch.bind_symbolic_shape %1342, [%549], affine_map<()[s0] -> (s0, 2097152)> : !torch.vtensor<[?,2097152],f16>
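// Repeat the scatter for the quantized values at the adjacent rows (indices + 1), then fold the buffer back to its flat [?, 2097152] layout.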
%int32_971 = torch.constant.int 32
%int2_972 = torch.constant.int 2
%int32_973 = torch.constant.int 32
%int8_974 = torch.constant.int 8
%int128_975 = torch.constant.int 128
%1343 = torch.prim.ListConstruct %661, %int32_971, %int2_972, %int32_973, %int8_974, %int128_975 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1344 = torch.aten.view %1342, %1343 : !torch.vtensor<[?,2097152],f16>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f16>
torch.bind_symbolic_shape %1344, [%549], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f16>
%int32_976 = torch.constant.int 32
%int8_977 = torch.constant.int 8
%int128_978 = torch.constant.int 128
%1345 = torch.prim.ListConstruct %1334, %int32_976, %int8_977, %int128_978 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1346 = torch.aten.view %1344, %1345 : !torch.vtensor<[?,32,2,32,8,128],f16>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f16>
torch.bind_symbolic_shape %1346, [%549], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f16>
%int1_979 = torch.constant.int 1
%int32_980 = torch.constant.int 32
%int8_981 = torch.constant.int 8
%int128_982 = torch.constant.int 128
%1347 = torch.prim.ListConstruct %int1_979, %670, %int32_980, %int8_981, %int128_982 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1348 = torch.aten.view %1322, %1347 : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %1348, [%548], affine_map<()[s0] -> (1, s0, 32, 8, 128)> : !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>
%int32_983 = torch.constant.int 32
%int8_984 = torch.constant.int 8
%int128_985 = torch.constant.int 128
%1349 = torch.prim.ListConstruct %670, %int32_983, %int8_984, %int128_985 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1350 = torch.aten.view %1348, %1349 : !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %1350, [%548], affine_map<()[s0] -> (s0, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
%int1_986 = torch.constant.int 1
%int1_987 = torch.constant.int 1
%1351 = torch.aten.add.Scalar %1324, %int1_986, %int1_987 : !torch.vtensor<[1,?],si64>, !torch.int, !torch.int -> !torch.vtensor<[1,?],si64>
torch.bind_symbolic_shape %1351, [%548], affine_map<()[s0] -> (1, s0)> : !torch.vtensor<[1,?],si64>
%1352 = torch.prim.ListConstruct %670 : (!torch.int) -> !torch.list<int>
%1353 = torch.aten.view %1351, %1352 : !torch.vtensor<[1,?],si64>, !torch.list<int> -> !torch.vtensor<[?],si64>
torch.bind_symbolic_shape %1353, [%548], affine_map<()[s0] -> (s0)> : !torch.vtensor<[?],si64>
%1354 = torch.prim.ListConstruct %1353 : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>>
%false_988 = torch.constant.bool false
%1355 = torch.aten.index_put %1346, %1354, %1350, %false_988 : !torch.vtensor<[?,32,8,128],f16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.bool -> !torch.vtensor<[?,32,8,128],f16>
torch.bind_symbolic_shape %1355, [%549], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f16>
%int32_989 = torch.constant.int 32
%int2_990 = torch.constant.int 2
%int32_991 = torch.constant.int 32
%int8_992 = torch.constant.int 8
%int128_993 = torch.constant.int 128
%1356 = torch.prim.ListConstruct %661, %int32_989, %int2_990, %int32_991, %int8_992, %int128_993 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1357 = torch.aten.view %1355, %1356 : !torch.vtensor<[?,32,8,128],f16>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f16>
torch.bind_symbolic_shape %1357, [%549], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f16>
%int2097152_994 = torch.constant.int 2097152
%1358 = torch.prim.ListConstruct %661, %int2097152_994 : (!torch.int, !torch.int) -> !torch.list<int>
%1359 = torch.aten.view %1357, %1358 : !torch.vtensor<[?,32,2,32,8,128],f16>, !torch.list<int> -> !torch.vtensor<[?,2097152],f16>
torch.bind_symbolic_shape %1359, [%549], affine_map<()[s0] -> (s0, 2097152)> : !torch.vtensor<[?,2097152],f16>
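// Grouped-query attention expansion: replicate each of the 8 KV heads 4x (unsqueeze -> expand -> clone -> view) to match the 32 query heads.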
%int-2_995 = torch.constant.int -2
%1360 = torch.aten.unsqueeze %1319, %int-2_995 : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %1360, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 1, 128)> : !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>
%int1_996 = torch.constant.int 1
%int8_997 = torch.constant.int 8
%int4_998 = torch.constant.int 4
%int128_999 = torch.constant.int 128
%1361 = torch.prim.ListConstruct %int1_996, %1301, %int8_997, %int4_998, %int128_999 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%false_1000 = torch.constant.bool false
%1362 = torch.aten.expand %1360, %1361, %false_1000 : !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %1362, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 4, 128)> : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
%int0_1001 = torch.constant.int 0
%1363 = torch.aten.clone %1362, %int0_1001 : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %1363, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 4, 128)> : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
%int1_1002 = torch.constant.int 1
%int32_1003 = torch.constant.int 32
%int128_1004 = torch.constant.int 128
%1364 = torch.prim.ListConstruct %int1_1002, %1301, %int32_1003, %int128_1004 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1365 = torch.aten._unsafe_view %1363, %1364 : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %1365, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>
%int-2_1005 = torch.constant.int -2
%1366 = torch.aten.unsqueeze %1322, %int-2_1005 : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %1366, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 1, 128)> : !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>
%int1_1006 = torch.constant.int 1
%1367 = torch.aten.size.int %1253, %int1_1006 : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>, !torch.int -> !torch.int
%int1_1007 = torch.constant.int 1
%int8_1008 = torch.constant.int 8
%int4_1009 = torch.constant.int 4
%int128_1010 = torch.constant.int 128
%1368 = torch.prim.ListConstruct %int1_1007, %1367, %int8_1008, %int4_1009, %int128_1010 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%false_1011 = torch.constant.bool false
%1369 = torch.aten.expand %1366, %1368, %false_1011 : !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %1369, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 4, 128)> : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
%int0_1012 = torch.constant.int 0
%1370 = torch.aten.clone %1369, %int0_1012 : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %1370, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 4, 128)> : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
%int1_1013 = torch.constant.int 1
%int32_1014 = torch.constant.int 32
%int128_1015 = torch.constant.int 128
%1371 = torch.prim.ListConstruct %int1_1013, %1367, %int32_1014, %int128_1015 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1372 = torch.aten._unsafe_view %1370, %1371 : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %1372, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>
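// Dequantize the expanded keys and values for attention: f8 -> f32, multiply by %59, cast to bf16.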
%int6_1016 = torch.constant.int 6
%1373 = torch.prims.convert_element_type %1365, %int6_1016 : !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,32,128],f32>
torch.bind_symbolic_shape %1373, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32>
%1374 = torch.aten.mul.Tensor %1373, %59 : !torch.vtensor<[1,?,32,128],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,32,128],f32>
torch.bind_symbolic_shape %1374, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32>
%int15_1017 = torch.constant.int 15
%1375 = torch.prims.convert_element_type %1374, %int15_1017 : !torch.vtensor<[1,?,32,128],f32>, !torch.int -> !torch.vtensor<[1,?,32,128],bf16>
torch.bind_symbolic_shape %1375, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16>
%int6_1018 = torch.constant.int 6
%1376 = torch.prims.convert_element_type %1372, %int6_1018 : !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,32,128],f32>
torch.bind_symbolic_shape %1376, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32>
%1377 = torch.aten.mul.Tensor %1376, %59 : !torch.vtensor<[1,?,32,128],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,32,128],f32>
torch.bind_symbolic_shape %1377, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32>
%int15_1019 = torch.constant.int 15
%1378 = torch.prims.convert_element_type %1377, %int15_1019 : !torch.vtensor<[1,?,32,128],f32>, !torch.int -> !torch.vtensor<[1,?,32,128],bf16>
torch.bind_symbolic_shape %1378, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16>
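// Transpose Q, K, and V to [1, 32, seq, 128] and run causal scaled-dot-product attention.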
%int1_1020 = torch.constant.int 1
%int2_1021 = torch.constant.int 2
%1379 = torch.aten.transpose.int %1288, %int1_1020, %int2_1021 : !torch.vtensor<[1,?,32,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,32,?,128],bf16>
torch.bind_symbolic_shape %1379, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16>
%int1_1022 = torch.constant.int 1
%int2_1023 = torch.constant.int 2
%1380 = torch.aten.transpose.int %1375, %int1_1022, %int2_1023 : !torch.vtensor<[1,?,32,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,32,?,128],bf16>
torch.bind_symbolic_shape %1380, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16>
%int1_1024 = torch.constant.int 1
%int2_1025 = torch.constant.int 2
%1381 = torch.aten.transpose.int %1378, %int1_1024, %int2_1025 : !torch.vtensor<[1,?,32,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,32,?,128],bf16>
torch.bind_symbolic_shape %1381, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16>
%float0.000000e00_1026 = torch.constant.float 0.000000e+00
%true_1027 = torch.constant.bool true
%none_1028 = torch.constant.none
%none_1029 = torch.constant.none
%1382:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%1379, %1380, %1381, %float0.000000e00_1026, %true_1027, %none_1028, %none_1029) : (!torch.vtensor<[1,32,?,128],bf16>, !torch.vtensor<[1,32,?,128],bf16>, !torch.vtensor<[1,32,?,128],bf16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[1,32,?,128],bf16>, !torch.vtensor<[1,32,?],f32>)
torch.bind_symbolic_shape %1382#0, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16>
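// Transpose the attention output back to [1, seq, 32, 128], flatten the heads to 4096, and quantize for the output projection.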
%int1_1030 = torch.constant.int 1
%int2_1031 = torch.constant.int 2
%1383 = torch.aten.transpose.int %1382#0, %int1_1030, %int2_1031 : !torch.vtensor<[1,32,?,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,?,32,128],bf16>
torch.bind_symbolic_shape %1383, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16>
%int1_1032 = torch.constant.int 1
%int4096_1033 = torch.constant.int 4096
%1384 = torch.prim.ListConstruct %int1_1032, %1273, %int4096_1033 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1385 = torch.aten.view %1383, %1384 : !torch.vtensor<[1,?,32,128],bf16>, !torch.list<int> -> !torch.vtensor<[1,?,4096],bf16>
torch.bind_symbolic_shape %1385, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16>
%1386 = torch.aten.div.Tensor %1385, %60 : !torch.vtensor<[1,?,4096],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],bf16>
torch.bind_symbolic_shape %1386, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16>
%float-2.400000e02_1034 = torch.constant.float -2.400000e+02
%float2.400000e02_1035 = torch.constant.float 2.400000e+02
%1387 = torch.aten.clamp %1386, %float-2.400000e02_1034, %float2.400000e02_1035 : !torch.vtensor<[1,?,4096],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],bf16>
torch.bind_symbolic_shape %1387, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16>
%int26_1036 = torch.constant.int 26
%1388 = torch.prims.convert_element_type %1387, %int26_1036 : !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1388, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
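// Output projection: f8 matmul against the transposed [4096,4096] weight, dequantize to bf16, add the residual %1217.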
%int-2_1037 = torch.constant.int -2
%int-1_1038 = torch.constant.int -1
%1389 = torch.aten.transpose.int %61, %int-2_1037, %int-1_1038 : !torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%int4096_1039 = torch.constant.int 4096
%1390 = torch.prim.ListConstruct %1273, %int4096_1039 : (!torch.int, !torch.int) -> !torch.list<int>
%1391 = torch.aten.view %1388, %1390 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1391, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%1392 = torch.aten.mm %1391, %1389 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,4096],f8E4M3FNUZ> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1392, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%int1_1040 = torch.constant.int 1
%int4096_1041 = torch.constant.int 4096
%1393 = torch.prim.ListConstruct %int1_1040, %1273, %int4096_1041 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1394 = torch.aten.view %1392, %1393 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1394, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int15_1042 = torch.constant.int 15
%1395 = torch.prims.convert_element_type %1394, %int15_1042 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,4096],bf16>
torch.bind_symbolic_shape %1395, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16>
%int1_1043 = torch.constant.int 1
%1396 = torch.aten.add.Tensor %1217, %1395, %int1_1043 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1396, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
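// RMSNorm (FFN norm): mean of squares over the hidden dim, add eps 1e-05, rsqrt, scale by the norm weight %62, then quantize for the gate projection.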
%int2_1044 = torch.constant.int 2
%1397 = torch.aten.pow.Tensor_Scalar %1396, %int2_1044 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1397, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int-1_1045 = torch.constant.int -1
%1398 = torch.prim.ListConstruct %int-1_1045 : (!torch.int) -> !torch.list<int>
%true_1046 = torch.constant.bool true
%none_1047 = torch.constant.none
%1399 = torch.aten.mean.dim %1397, %1398, %true_1046, %none_1047 : !torch.vtensor<[1,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,?,1],f32>
torch.bind_symbolic_shape %1399, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32>
%float1.000000e-05_1048 = torch.constant.float 1.000000e-05
%int1_1049 = torch.constant.int 1
%1400 = torch.aten.add.Scalar %1399, %float1.000000e-05_1048, %int1_1049 : !torch.vtensor<[1,?,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,?,1],f32>
torch.bind_symbolic_shape %1400, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32>
%1401 = torch.aten.rsqrt %1400 : !torch.vtensor<[1,?,1],f32> -> !torch.vtensor<[1,?,1],f32>
torch.bind_symbolic_shape %1401, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32>
%1402 = torch.aten.mul.Tensor %1396, %1401 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[1,?,1],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1402, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%1403 = torch.aten.mul.Tensor %62, %1402 : !torch.vtensor<[4096],bf16>, !torch.vtensor<[1,?,4096],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1403, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%1404 = torch.aten.div.Tensor %1403, %63 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1404, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%float-2.400000e02_1050 = torch.constant.float -2.400000e+02
%float2.400000e02_1051 = torch.constant.float 2.400000e+02
%1405 = torch.aten.clamp %1404, %float-2.400000e02_1050, %float2.400000e02_1051 : !torch.vtensor<[1,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1405, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int26_1052 = torch.constant.int 26
%1406 = torch.prims.convert_element_type %1405, %int26_1052 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1406, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
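// Gate projection (4096 -> 14336) followed by SiLU.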
%int-2_1053 = torch.constant.int -2
%int-1_1054 = torch.constant.int -1
%1407 = torch.aten.transpose.int %64, %int-2_1053, %int-1_1054 : !torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
%int4096_1055 = torch.constant.int 4096
%1408 = torch.prim.ListConstruct %566, %int4096_1055 : (!torch.int, !torch.int) -> !torch.list<int>
%1409 = torch.aten.view %1406, %1408 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1409, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%1410 = torch.aten.mm %1409, %1407 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,14336],f8E4M3FNUZ> -> !torch.vtensor<[?,14336],f8E4M3FNUZ>
torch.bind_symbolic_shape %1410, [%548], affine_map<()[s0] -> (s0 * 32, 14336)> : !torch.vtensor<[?,14336],f8E4M3FNUZ>
%int1_1056 = torch.constant.int 1
%int14336_1057 = torch.constant.int 14336
%1411 = torch.prim.ListConstruct %int1_1056, %566, %int14336_1057 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1412 = torch.aten.view %1410, %1411 : !torch.vtensor<[?,14336],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,14336],f8E4M3FNUZ>
torch.bind_symbolic_shape %1412, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>
%int15_1058 = torch.constant.int 15
%1413 = torch.prims.convert_element_type %1412, %int15_1058 : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,14336],bf16>
torch.bind_symbolic_shape %1413, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16>
%1414 = torch.aten.silu %1413 : !torch.vtensor<[1,?,14336],bf16> -> !torch.vtensor<[1,?,14336],bf16>
torch.bind_symbolic_shape %1414, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16>
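// Up projection (4096 -> 14336) and elementwise multiply with the SiLU-gated activations (SwiGLU).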
%1415 = torch.aten.div.Tensor %1403, %65 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1415, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%float-2.400000e02_1059 = torch.constant.float -2.400000e+02
%float2.400000e02_1060 = torch.constant.float 2.400000e+02
%1416 = torch.aten.clamp %1415, %float-2.400000e02_1059, %float2.400000e02_1060 : !torch.vtensor<[1,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1416, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int26_1061 = torch.constant.int 26
%1417 = torch.prims.convert_element_type %1416, %int26_1061 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1417, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int-2_1062 = torch.constant.int -2
%int-1_1063 = torch.constant.int -1
%1418 = torch.aten.transpose.int %66, %int-2_1062, %int-1_1063 : !torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
%int4096_1064 = torch.constant.int 4096
%1419 = torch.prim.ListConstruct %566, %int4096_1064 : (!torch.int, !torch.int) -> !torch.list<int>
%1420 = torch.aten.view %1417, %1419 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1420, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%1421 = torch.aten.mm %1420, %1418 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,14336],f8E4M3FNUZ> -> !torch.vtensor<[?,14336],f8E4M3FNUZ>
torch.bind_symbolic_shape %1421, [%548], affine_map<()[s0] -> (s0 * 32, 14336)> : !torch.vtensor<[?,14336],f8E4M3FNUZ>
%int1_1065 = torch.constant.int 1
%int14336_1066 = torch.constant.int 14336
%1422 = torch.prim.ListConstruct %int1_1065, %566, %int14336_1066 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1423 = torch.aten.view %1421, %1422 : !torch.vtensor<[?,14336],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,14336],f8E4M3FNUZ>
torch.bind_symbolic_shape %1423, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>
%int15_1067 = torch.constant.int 15
%1424 = torch.prims.convert_element_type %1423, %int15_1067 : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,14336],bf16>
torch.bind_symbolic_shape %1424, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16>
%1425 = torch.aten.mul.Tensor %1414, %1424 : !torch.vtensor<[1,?,14336],bf16>, !torch.vtensor<[1,?,14336],bf16> -> !torch.vtensor<[1,?,14336],bf16>
torch.bind_symbolic_shape %1425, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16>
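// Quantize the gated activations, apply the down projection (14336 -> 4096), dequantize, and add the residual.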
%1426 = torch.aten.div.Tensor %1425, %67 : !torch.vtensor<[1,?,14336],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,14336],bf16>
torch.bind_symbolic_shape %1426, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16>
%float-2.400000e02_1068 = torch.constant.float -2.400000e+02
%float2.400000e02_1069 = torch.constant.float 2.400000e+02
%1427 = torch.aten.clamp %1426, %float-2.400000e02_1068, %float2.400000e02_1069 : !torch.vtensor<[1,?,14336],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,14336],bf16>
torch.bind_symbolic_shape %1427, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16>
%int26_1070 = torch.constant.int 26
%1428 = torch.prims.convert_element_type %1427, %int26_1070 : !torch.vtensor<[1,?,14336],bf16>, !torch.int -> !torch.vtensor<[1,?,14336],f8E4M3FNUZ>
torch.bind_symbolic_shape %1428, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>
%int-2_1071 = torch.constant.int -2
%int-1_1072 = torch.constant.int -1
%1429 = torch.aten.transpose.int %68, %int-2_1071, %int-1_1072 : !torch.vtensor<[4096,14336],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%int1_1073 = torch.constant.int 1
%1430 = torch.aten.size.int %1412, %int1_1073 : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>, !torch.int -> !torch.int
%int14336_1074 = torch.constant.int 14336
%1431 = torch.prim.ListConstruct %1430, %int14336_1074 : (!torch.int, !torch.int) -> !torch.list<int>
%1432 = torch.aten.view %1428, %1431 : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,14336],f8E4M3FNUZ>
torch.bind_symbolic_shape %1432, [%548], affine_map<()[s0] -> (s0 * 32, 14336)> : !torch.vtensor<[?,14336],f8E4M3FNUZ>
%1433 = torch.aten.mm %1432, %1429 : !torch.vtensor<[?,14336],f8E4M3FNUZ>, !torch.vtensor<[14336,4096],f8E4M3FNUZ> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1433, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%int1_1075 = torch.constant.int 1
%int4096_1076 = torch.constant.int 4096
%1434 = torch.prim.ListConstruct %int1_1075, %1430, %int4096_1076 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1435 = torch.aten.view %1433, %1434 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1435, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int15_1077 = torch.constant.int 15
%1436 = torch.prims.convert_element_type %1435, %int15_1077 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,4096],bf16>
torch.bind_symbolic_shape %1436, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16>
%int1_1078 = torch.constant.int 1
%1437 = torch.aten.add.Tensor %1396, %1436, %int1_1078 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1437, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
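// RMSNorm at the start of the next decoder block, then quantize for its Q projection.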
%int2_1079 = torch.constant.int 2
%1438 = torch.aten.pow.Tensor_Scalar %1437, %int2_1079 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1438, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int-1_1080 = torch.constant.int -1
%1439 = torch.prim.ListConstruct %int-1_1080 : (!torch.int) -> !torch.list<int>
%true_1081 = torch.constant.bool true
%none_1082 = torch.constant.none
%1440 = torch.aten.mean.dim %1438, %1439, %true_1081, %none_1082 : !torch.vtensor<[1,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,?,1],f32>
torch.bind_symbolic_shape %1440, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32>
%float1.000000e-05_1083 = torch.constant.float 1.000000e-05
%int1_1084 = torch.constant.int 1
%1441 = torch.aten.add.Scalar %1440, %float1.000000e-05_1083, %int1_1084 : !torch.vtensor<[1,?,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,?,1],f32>
torch.bind_symbolic_shape %1441, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32>
%1442 = torch.aten.rsqrt %1441 : !torch.vtensor<[1,?,1],f32> -> !torch.vtensor<[1,?,1],f32>
torch.bind_symbolic_shape %1442, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32>
%1443 = torch.aten.mul.Tensor %1437, %1442 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[1,?,1],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1443, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%1444 = torch.aten.mul.Tensor %69, %1443 : !torch.vtensor<[4096],bf16>, !torch.vtensor<[1,?,4096],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1444, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%1445 = torch.aten.div.Tensor %1444, %70 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1445, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%float-2.400000e02_1085 = torch.constant.float -2.400000e+02
%float2.400000e02_1086 = torch.constant.float 2.400000e+02
%1446 = torch.aten.clamp %1445, %float-2.400000e02_1085, %float2.400000e02_1086 : !torch.vtensor<[1,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1446, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int26_1087 = torch.constant.int 26
%1447 = torch.prims.convert_element_type %1446, %int26_1087 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1447, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
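// Q projection: f8 matmul against the transposed [4096,4096] weight, dequantized to bf16.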
%int-2_1088 = torch.constant.int -2
%int-1_1089 = torch.constant.int -1
%1448 = torch.aten.transpose.int %71, %int-2_1088, %int-1_1089 : !torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%int4096_1090 = torch.constant.int 4096
%1449 = torch.prim.ListConstruct %566, %int4096_1090 : (!torch.int, !torch.int) -> !torch.list<int>
%1450 = torch.aten.view %1447, %1449 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1450, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%1451 = torch.aten.mm %1450, %1448 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,4096],f8E4M3FNUZ> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1451, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%int1_1091 = torch.constant.int 1
%int4096_1092 = torch.constant.int 4096
%1452 = torch.prim.ListConstruct %int1_1091, %566, %int4096_1092 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1453 = torch.aten.view %1451, %1452 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1453, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int15_1093 = torch.constant.int 15
%1454 = torch.prims.convert_element_type %1453, %int15_1093 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,4096],bf16>
torch.bind_symbolic_shape %1454, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16>
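// K projection: re-quantize the normed activations with the K input scale, f8 matmul (4096 -> 1024), dequantize to bf16.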
%1455 = torch.aten.div.Tensor %1444, %72 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1455, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%float-2.400000e02_1094 = torch.constant.float -2.400000e+02
%float2.400000e02_1095 = torch.constant.float 2.400000e+02
%1456 = torch.aten.clamp %1455, %float-2.400000e02_1094, %float2.400000e02_1095 : !torch.vtensor<[1,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1456, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int26_1096 = torch.constant.int 26
%1457 = torch.prims.convert_element_type %1456, %int26_1096 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1457, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int-2_1097 = torch.constant.int -2
%int-1_1098 = torch.constant.int -1
%1458 = torch.aten.transpose.int %73, %int-2_1097, %int-1_1098 : !torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,1024],f8E4M3FNUZ>
%int4096_1099 = torch.constant.int 4096
%1459 = torch.prim.ListConstruct %566, %int4096_1099 : (!torch.int, !torch.int) -> !torch.list<int>
%1460 = torch.aten.view %1457, %1459 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1460, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%1461 = torch.aten.mm %1460, %1458 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,1024],f8E4M3FNUZ> -> !torch.vtensor<[?,1024],f8E4M3FNUZ>
torch.bind_symbolic_shape %1461, [%548], affine_map<()[s0] -> (s0 * 32, 1024)> : !torch.vtensor<[?,1024],f8E4M3FNUZ>
%int1_1100 = torch.constant.int 1
%int1024_1101 = torch.constant.int 1024
%1462 = torch.prim.ListConstruct %int1_1100, %566, %int1024_1101 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1463 = torch.aten.view %1461, %1462 : !torch.vtensor<[?,1024],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,1024],f8E4M3FNUZ>
torch.bind_symbolic_shape %1463, [%548], affine_map<()[s0] -> (1, s0 * 32, 1024)> : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>
%int15_1102 = torch.constant.int 15
%1464 = torch.prims.convert_element_type %1463, %int15_1102 : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,1024],bf16>
torch.bind_symbolic_shape %1464, [%548], affine_map<()[s0] -> (1, s0 * 32, 1024)> : !torch.vtensor<[1,?,1024],bf16>
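// V projection: same pattern as K (4096 -> 1024).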
%1465 = torch.aten.div.Tensor %1444, %74 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1465, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%float-2.400000e02_1103 = torch.constant.float -2.400000e+02
%float2.400000e02_1104 = torch.constant.float 2.400000e+02
%1466 = torch.aten.clamp %1465, %float-2.400000e02_1103, %float2.400000e02_1104 : !torch.vtensor<[1,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1466, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int26_1105 = torch.constant.int 26
%1467 = torch.prims.convert_element_type %1466, %int26_1105 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1467, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int-2_1106 = torch.constant.int -2
%int-1_1107 = torch.constant.int -1
%1468 = torch.aten.transpose.int %75, %int-2_1106, %int-1_1107 : !torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,1024],f8E4M3FNUZ>
%int4096_1108 = torch.constant.int 4096
%1469 = torch.prim.ListConstruct %566, %int4096_1108 : (!torch.int, !torch.int) -> !torch.list<int>
%1470 = torch.aten.view %1467, %1469 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1470, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%1471 = torch.aten.mm %1470, %1468 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,1024],f8E4M3FNUZ> -> !torch.vtensor<[?,1024],f8E4M3FNUZ>
torch.bind_symbolic_shape %1471, [%548], affine_map<()[s0] -> (s0 * 32, 1024)> : !torch.vtensor<[?,1024],f8E4M3FNUZ>
%int1_1109 = torch.constant.int 1
%int1024_1110 = torch.constant.int 1024
%1472 = torch.prim.ListConstruct %int1_1109, %566, %int1024_1110 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1473 = torch.aten.view %1471, %1472 : !torch.vtensor<[?,1024],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,1024],f8E4M3FNUZ>
torch.bind_symbolic_shape %1473, [%548], affine_map<()[s0] -> (1, s0 * 32, 1024)> : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>
%int15_1111 = torch.constant.int 15
%1474 = torch.prims.convert_element_type %1473, %int15_1111 : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,1024],bf16>
torch.bind_symbolic_shape %1474, [%548], affine_map<()[s0] -> (1, s0 * 32, 1024)> : !torch.vtensor<[1,?,1024],bf16>
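// Reshape Q to [1, seq, 32, 128] and K/V to [1, seq, 8, 128] heads.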
%int1_1112 = torch.constant.int 1
%int32_1113 = torch.constant.int 32
%int128_1114 = torch.constant.int 128
%1475 = torch.prim.ListConstruct %int1_1112, %566, %int32_1113, %int128_1114 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1476 = torch.aten.view %1454, %1475 : !torch.vtensor<[1,?,4096],bf16>, !torch.list<int> -> !torch.vtensor<[1,?,32,128],bf16>
torch.bind_symbolic_shape %1476, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16>
%int1_1115 = torch.constant.int 1
%int8_1116 = torch.constant.int 8
%int128_1117 = torch.constant.int 128
%1477 = torch.prim.ListConstruct %int1_1115, %566, %int8_1116, %int128_1117 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1478 = torch.aten.view %1464, %1477 : !torch.vtensor<[1,?,1024],bf16>, !torch.list<int> -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %1478, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
%int1_1118 = torch.constant.int 1
%int8_1119 = torch.constant.int 8
%int128_1120 = torch.constant.int 128
%1479 = torch.prim.ListConstruct %int1_1118, %566, %int8_1119, %int128_1120 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1480 = torch.aten.view %1474, %1479 : !torch.vtensor<[1,?,1024],bf16>, !torch.list<int> -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %1480, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
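// Rebuild the RoPE angle table for this block's rotary embedding.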
%int131072_1121 = torch.constant.int 131072
%none_1122 = torch.constant.none
%none_1123 = torch.constant.none
%cpu_1124 = torch.constant.device "cpu"
%false_1125 = torch.constant.bool false
%1481 = torch.aten.arange %int131072_1121, %none_1122, %none_1123, %cpu_1124, %false_1125 : !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[131072],si64>
%int0_1126 = torch.constant.int 0
%int128_1127 = torch.constant.int 128
%none_1128 = torch.constant.none
%none_1129 = torch.constant.none
%cpu_1130 = torch.constant.device "cpu"
%false_1131 = torch.constant.bool false
%1482 = torch.aten.arange.start %int0_1126, %int128_1127, %none_1128, %none_1129, %cpu_1130, %false_1131 : !torch.int, !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[128],si64>
%int2_1132 = torch.constant.int 2
%1483 = torch.aten.floor_divide.Scalar %1482, %int2_1132 : !torch.vtensor<[128],si64>, !torch.int -> !torch.vtensor<[128],si64>
%int6_1133 = torch.constant.int 6
%1484 = torch.prims.convert_element_type %1483, %int6_1133 : !torch.vtensor<[128],si64>, !torch.int -> !torch.vtensor<[128],f32>
%int128_1134 = torch.constant.int 128
%1485 = torch.aten.div.Scalar %1484, %int128_1134 : !torch.vtensor<[128],f32>, !torch.int -> !torch.vtensor<[128],f32>
%float2.000000e00_1135 = torch.constant.float 2.000000e+00
%1486 = torch.aten.mul.Scalar %1485, %float2.000000e00_1135 : !torch.vtensor<[128],f32>, !torch.float -> !torch.vtensor<[128],f32>
%float5.000000e05_1136 = torch.constant.float 5.000000e+05
%1487 = torch.aten.pow.Scalar %float5.000000e05_1136, %1486 : !torch.float, !torch.vtensor<[128],f32> -> !torch.vtensor<[128],f32>
%1488 = torch.aten.reciprocal %1487 : !torch.vtensor<[128],f32> -> !torch.vtensor<[128],f32>
%float1.000000e00_1137 = torch.constant.float 1.000000e+00
%1489 = torch.aten.mul.Scalar %1488, %float1.000000e00_1137 : !torch.vtensor<[128],f32>, !torch.float -> !torch.vtensor<[128],f32>
%int131072_1138 = torch.constant.int 131072
%int1_1139 = torch.constant.int 1
%1490 = torch.prim.ListConstruct %int131072_1138, %int1_1139 : (!torch.int, !torch.int) -> !torch.list<int>
%1491 = torch.aten.view %1481, %1490 : !torch.vtensor<[131072],si64>, !torch.list<int> -> !torch.vtensor<[131072,1],si64>
%1492 = torch.aten.mul.Tensor %1491, %1489 : !torch.vtensor<[131072,1],si64>, !torch.vtensor<[128],f32> -> !torch.vtensor<[131072,128],f32>
%int1_1140 = torch.constant.int 1
%1493 = torch.aten.size.int %1453, %int1_1140 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.int -> !torch.int
%int0_1141 = torch.constant.int 0
%1494 = torch.aten.add.int %int0_1141, %1493 : !torch.int, !torch.int -> !torch.int
%int0_1142 = torch.constant.int 0
%int0_1143 = torch.constant.int 0
%int1_1144 = torch.constant.int 1
%1495 = torch.aten.slice.Tensor %1492, %int0_1142, %int0_1143, %1494, %int1_1144 : !torch.vtensor<[131072,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],f32>
torch.bind_symbolic_shape %1495, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],f32>
%int1_1145 = torch.constant.int 1
%int0_1146 = torch.constant.int 0
%int9223372036854775807_1147 = torch.constant.int 9223372036854775807
%int1_1148 = torch.constant.int 1
%1496 = torch.aten.slice.Tensor %1495, %int1_1145, %int0_1146, %int9223372036854775807_1147, %int1_1148 : !torch.vtensor<[?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],f32>
torch.bind_symbolic_shape %1496, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],f32>
%int1_1149 = torch.constant.int 1
%int0_1150 = torch.constant.int 0
%int9223372036854775807_1151 = torch.constant.int 9223372036854775807
%int1_1152 = torch.constant.int 1
%1497 = torch.aten.slice.Tensor %1496, %int1_1149, %int0_1150, %int9223372036854775807_1151, %int1_1152 : !torch.vtensor<[?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],f32>
torch.bind_symbolic_shape %1497, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],f32>
%int0_1153 = torch.constant.int 0
%1498 = torch.aten.unsqueeze %1497, %int0_1153 : !torch.vtensor<[?,128],f32>, !torch.int -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %1498, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int1_1154 = torch.constant.int 1
%int0_1155 = torch.constant.int 0
%int9223372036854775807_1156 = torch.constant.int 9223372036854775807
%int1_1157 = torch.constant.int 1
%1499 = torch.aten.slice.Tensor %1498, %int1_1154, %int0_1155, %int9223372036854775807_1156, %int1_1157 : !torch.vtensor<[1,?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %1499, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int2_1158 = torch.constant.int 2
%int0_1159 = torch.constant.int 0
%int9223372036854775807_1160 = torch.constant.int 9223372036854775807
%int1_1161 = torch.constant.int 1
%1500 = torch.aten.slice.Tensor %1499, %int2_1158, %int0_1159, %int9223372036854775807_1160, %int1_1161 : !torch.vtensor<[1,?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %1500, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int1_1162 = torch.constant.int 1
%int1_1163 = torch.constant.int 1
%int1_1164 = torch.constant.int 1
%1501 = torch.prim.ListConstruct %int1_1162, %int1_1163, %int1_1164 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1502 = torch.aten.repeat %1500, %1501 : !torch.vtensor<[1,?,128],f32>, !torch.list<int> -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %1502, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int6_1165 = torch.constant.int 6
%1503 = torch.prims.convert_element_type %1476, %int6_1165 : !torch.vtensor<[1,?,32,128],bf16>, !torch.int -> !torch.vtensor<[1,?,32,128],f32>
torch.bind_symbolic_shape %1503, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32>
%1504 = torch_c.to_builtin_tensor %1503 : !torch.vtensor<[1,?,32,128],f32> -> tensor<1x?x32x128xf32>
%1505 = torch_c.to_builtin_tensor %1502 : !torch.vtensor<[1,?,128],f32> -> tensor<1x?x128xf32>
%1506 = util.call @sharktank_rotary_embedding_1_D_32_128_f32(%1504, %1505) : (tensor<1x?x32x128xf32>, tensor<1x?x128xf32>) -> tensor<1x?x32x128xf32>
%1507 = torch_c.from_builtin_tensor %1506 : tensor<1x?x32x128xf32> -> !torch.vtensor<[1,?,32,128],f32>
torch.bind_symbolic_shape %1507, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32>
%int15_1166 = torch.constant.int 15
%1508 = torch.prims.convert_element_type %1507, %int15_1166 : !torch.vtensor<[1,?,32,128],f32>, !torch.int -> !torch.vtensor<[1,?,32,128],bf16>
torch.bind_symbolic_shape %1508, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16>
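    // The same RoPE table is recomputed from scratch below and applied to the 8-head K activations (%1478) via
    // the 8-head variant of the rotary kernel; nothing is reused from the Q-path table above.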
%int131072_1167 = torch.constant.int 131072
%none_1168 = torch.constant.none
%none_1169 = torch.constant.none
%cpu_1170 = torch.constant.device "cpu"
%false_1171 = torch.constant.bool false
%1509 = torch.aten.arange %int131072_1167, %none_1168, %none_1169, %cpu_1170, %false_1171 : !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[131072],si64>
%int0_1172 = torch.constant.int 0
%int128_1173 = torch.constant.int 128
%none_1174 = torch.constant.none
%none_1175 = torch.constant.none
%cpu_1176 = torch.constant.device "cpu"
%false_1177 = torch.constant.bool false
%1510 = torch.aten.arange.start %int0_1172, %int128_1173, %none_1174, %none_1175, %cpu_1176, %false_1177 : !torch.int, !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[128],si64>
%int2_1178 = torch.constant.int 2
%1511 = torch.aten.floor_divide.Scalar %1510, %int2_1178 : !torch.vtensor<[128],si64>, !torch.int -> !torch.vtensor<[128],si64>
%int6_1179 = torch.constant.int 6
%1512 = torch.prims.convert_element_type %1511, %int6_1179 : !torch.vtensor<[128],si64>, !torch.int -> !torch.vtensor<[128],f32>
%int128_1180 = torch.constant.int 128
%1513 = torch.aten.div.Scalar %1512, %int128_1180 : !torch.vtensor<[128],f32>, !torch.int -> !torch.vtensor<[128],f32>
%float2.000000e00_1181 = torch.constant.float 2.000000e+00
%1514 = torch.aten.mul.Scalar %1513, %float2.000000e00_1181 : !torch.vtensor<[128],f32>, !torch.float -> !torch.vtensor<[128],f32>
%float5.000000e05_1182 = torch.constant.float 5.000000e+05
%1515 = torch.aten.pow.Scalar %float5.000000e05_1182, %1514 : !torch.float, !torch.vtensor<[128],f32> -> !torch.vtensor<[128],f32>
%1516 = torch.aten.reciprocal %1515 : !torch.vtensor<[128],f32> -> !torch.vtensor<[128],f32>
%float1.000000e00_1183 = torch.constant.float 1.000000e+00
%1517 = torch.aten.mul.Scalar %1516, %float1.000000e00_1183 : !torch.vtensor<[128],f32>, !torch.float -> !torch.vtensor<[128],f32>
%int131072_1184 = torch.constant.int 131072
%int1_1185 = torch.constant.int 1
%1518 = torch.prim.ListConstruct %int131072_1184, %int1_1185 : (!torch.int, !torch.int) -> !torch.list<int>
%1519 = torch.aten.view %1509, %1518 : !torch.vtensor<[131072],si64>, !torch.list<int> -> !torch.vtensor<[131072,1],si64>
%1520 = torch.aten.mul.Tensor %1519, %1517 : !torch.vtensor<[131072,1],si64>, !torch.vtensor<[128],f32> -> !torch.vtensor<[131072,128],f32>
%int1_1186 = torch.constant.int 1
%1521 = torch.aten.size.int %1463, %int1_1186 : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>, !torch.int -> !torch.int
%int0_1187 = torch.constant.int 0
%1522 = torch.aten.add.int %int0_1187, %1521 : !torch.int, !torch.int -> !torch.int
%int0_1188 = torch.constant.int 0
%int0_1189 = torch.constant.int 0
%int1_1190 = torch.constant.int 1
%1523 = torch.aten.slice.Tensor %1520, %int0_1188, %int0_1189, %1522, %int1_1190 : !torch.vtensor<[131072,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],f32>
torch.bind_symbolic_shape %1523, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],f32>
%int1_1191 = torch.constant.int 1
%int0_1192 = torch.constant.int 0
%int9223372036854775807_1193 = torch.constant.int 9223372036854775807
%int1_1194 = torch.constant.int 1
%1524 = torch.aten.slice.Tensor %1523, %int1_1191, %int0_1192, %int9223372036854775807_1193, %int1_1194 : !torch.vtensor<[?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],f32>
torch.bind_symbolic_shape %1524, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],f32>
%int1_1195 = torch.constant.int 1
%int0_1196 = torch.constant.int 0
%int9223372036854775807_1197 = torch.constant.int 9223372036854775807
%int1_1198 = torch.constant.int 1
%1525 = torch.aten.slice.Tensor %1524, %int1_1195, %int0_1196, %int9223372036854775807_1197, %int1_1198 : !torch.vtensor<[?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],f32>
torch.bind_symbolic_shape %1525, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],f32>
%int0_1199 = torch.constant.int 0
%1526 = torch.aten.unsqueeze %1525, %int0_1199 : !torch.vtensor<[?,128],f32>, !torch.int -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %1526, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int1_1200 = torch.constant.int 1
%int0_1201 = torch.constant.int 0
%int9223372036854775807_1202 = torch.constant.int 9223372036854775807
%int1_1203 = torch.constant.int 1
%1527 = torch.aten.slice.Tensor %1526, %int1_1200, %int0_1201, %int9223372036854775807_1202, %int1_1203 : !torch.vtensor<[1,?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %1527, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int2_1204 = torch.constant.int 2
%int0_1205 = torch.constant.int 0
%int9223372036854775807_1206 = torch.constant.int 9223372036854775807
%int1_1207 = torch.constant.int 1
%1528 = torch.aten.slice.Tensor %1527, %int2_1204, %int0_1205, %int9223372036854775807_1206, %int1_1207 : !torch.vtensor<[1,?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %1528, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int1_1208 = torch.constant.int 1
%int1_1209 = torch.constant.int 1
%int1_1210 = torch.constant.int 1
%1529 = torch.prim.ListConstruct %int1_1208, %int1_1209, %int1_1210 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1530 = torch.aten.repeat %1528, %1529 : !torch.vtensor<[1,?,128],f32>, !torch.list<int> -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %1530, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int6_1211 = torch.constant.int 6
%1531 = torch.prims.convert_element_type %1478, %int6_1211 : !torch.vtensor<[1,?,8,128],bf16>, !torch.int -> !torch.vtensor<[1,?,8,128],f32>
torch.bind_symbolic_shape %1531, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],f32>
%1532 = torch_c.to_builtin_tensor %1531 : !torch.vtensor<[1,?,8,128],f32> -> tensor<1x?x8x128xf32>
%1533 = torch_c.to_builtin_tensor %1530 : !torch.vtensor<[1,?,128],f32> -> tensor<1x?x128xf32>
%1534 = util.call @sharktank_rotary_embedding_1_D_8_128_f32(%1532, %1533) : (tensor<1x?x8x128xf32>, tensor<1x?x128xf32>) -> tensor<1x?x8x128xf32>
%1535 = torch_c.from_builtin_tensor %1534 : tensor<1x?x8x128xf32> -> !torch.vtensor<[1,?,8,128],f32>
torch.bind_symbolic_shape %1535, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],f32>
%int15_1212 = torch.constant.int 15
%1536 = torch.prims.convert_element_type %1535, %int15_1212 : !torch.vtensor<[1,?,8,128],f32>, !torch.int -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %1536, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
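    // Quantize the rotated K (%1536) and the pass-through V (%1480) for the KV cache: divide by what appears to be
    // the per-tensor KV-cache scale (%76), clamp to the f8E4M3FNUZ finite range [-240, 240], then cast to f8E4M3FNUZ.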
%1537 = torch.aten.div.Tensor %1536, %76 : !torch.vtensor<[1,?,8,128],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %1537, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
%float-2.400000e02_1213 = torch.constant.float -2.400000e+02
%float2.400000e02_1214 = torch.constant.float 2.400000e+02
%1538 = torch.aten.clamp %1537, %float-2.400000e02_1213, %float2.400000e02_1214 : !torch.vtensor<[1,?,8,128],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %1538, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
%int26_1215 = torch.constant.int 26
%1539 = torch.prims.convert_element_type %1538, %int26_1215 : !torch.vtensor<[1,?,8,128],bf16>, !torch.int -> !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %1539, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>
%1540 = torch.aten.div.Tensor %1480, %76 : !torch.vtensor<[1,?,8,128],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %1540, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
%float-2.400000e02_1216 = torch.constant.float -2.400000e+02
%float2.400000e02_1217 = torch.constant.float 2.400000e+02
%1541 = torch.aten.clamp %1540, %float-2.400000e02_1216, %float2.400000e02_1217 : !torch.vtensor<[1,?,8,128],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %1541, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
%int26_1218 = torch.constant.int 26
%1542 = torch.prims.convert_element_type %1541, %int26_1218 : !torch.vtensor<[1,?,8,128],bf16>, !torch.int -> !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %1542, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>
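    // Scatter K into the paged KV cache: flat slot indices are computed as page_id * 64 plus a fixed offset (8 here,
    // presumably selecting this layer's K sub-block), the quantized K is reshaped to the per-page layout, the f16
    // cache (%1359) is viewed as [pages*64, 32, 8, 128], and index_put writes the new entries.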
%int64_1219 = torch.constant.int 64
%1543 = torch.aten.mul.Scalar %arg2, %int64_1219 : !torch.vtensor<[1,?],si64>, !torch.int -> !torch.vtensor<[1,?],si64>
torch.bind_symbolic_shape %1543, [%548], affine_map<()[s0] -> (1, s0)> : !torch.vtensor<[1,?],si64>
%int8_1220 = torch.constant.int 8
%int1_1221 = torch.constant.int 1
%1544 = torch.aten.add.Scalar %1543, %int8_1220, %int1_1221 : !torch.vtensor<[1,?],si64>, !torch.int, !torch.int -> !torch.vtensor<[1,?],si64>
torch.bind_symbolic_shape %1544, [%548], affine_map<()[s0] -> (1, s0)> : !torch.vtensor<[1,?],si64>
%int1_1222 = torch.constant.int 1
%int32_1223 = torch.constant.int 32
%int8_1224 = torch.constant.int 8
%int128_1225 = torch.constant.int 128
%1545 = torch.prim.ListConstruct %int1_1222, %670, %int32_1223, %int8_1224, %int128_1225 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1546 = torch.aten.view %1539, %1545 : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %1546, [%548], affine_map<()[s0] -> (1, s0, 32, 8, 128)> : !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>
%int32_1226 = torch.constant.int 32
%int8_1227 = torch.constant.int 8
%int128_1228 = torch.constant.int 128
%1547 = torch.prim.ListConstruct %670, %int32_1226, %int8_1227, %int128_1228 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1548 = torch.aten.view %1546, %1547 : !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %1548, [%548], affine_map<()[s0] -> (s0, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
%1549 = torch.prim.ListConstruct %670 : (!torch.int) -> !torch.list<int>
%1550 = torch.aten.view %1544, %1549 : !torch.vtensor<[1,?],si64>, !torch.list<int> -> !torch.vtensor<[?],si64>
torch.bind_symbolic_shape %1550, [%548], affine_map<()[s0] -> (s0)> : !torch.vtensor<[?],si64>
%int32_1229 = torch.constant.int 32
%int2_1230 = torch.constant.int 2
%int32_1231 = torch.constant.int 32
%int8_1232 = torch.constant.int 8
%int128_1233 = torch.constant.int 128
%1551 = torch.prim.ListConstruct %661, %int32_1229, %int2_1230, %int32_1231, %int8_1232, %int128_1233 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1552 = torch.aten.view %1359, %1551 : !torch.vtensor<[?,2097152],f16>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f16>
torch.bind_symbolic_shape %1552, [%549], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f16>
%int32_1234 = torch.constant.int 32
%1553 = torch.aten.mul.int %661, %int32_1234 : !torch.int, !torch.int -> !torch.int
%int2_1235 = torch.constant.int 2
%1554 = torch.aten.mul.int %1553, %int2_1235 : !torch.int, !torch.int -> !torch.int
%int32_1236 = torch.constant.int 32
%int8_1237 = torch.constant.int 8
%int128_1238 = torch.constant.int 128
%1555 = torch.prim.ListConstruct %1554, %int32_1236, %int8_1237, %int128_1238 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1556 = torch.aten.view %1552, %1555 : !torch.vtensor<[?,32,2,32,8,128],f16>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f16>
torch.bind_symbolic_shape %1556, [%549], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f16>
%1557 = torch.prim.ListConstruct %1550 : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>>
%false_1239 = torch.constant.bool false
%1558 = torch.aten.index_put %1556, %1557, %1548, %false_1239 : !torch.vtensor<[?,32,8,128],f16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.bool -> !torch.vtensor<[?,32,8,128],f16>
torch.bind_symbolic_shape %1558, [%549], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f16>
%int32_1240 = torch.constant.int 32
%int2_1241 = torch.constant.int 2
%int32_1242 = torch.constant.int 32
%int8_1243 = torch.constant.int 8
%int128_1244 = torch.constant.int 128
%1559 = torch.prim.ListConstruct %661, %int32_1240, %int2_1241, %int32_1242, %int8_1243, %int128_1244 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1560 = torch.aten.view %1558, %1559 : !torch.vtensor<[?,32,8,128],f16>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f16>
torch.bind_symbolic_shape %1560, [%549], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f16>
%int2097152_1245 = torch.constant.int 2097152
%1561 = torch.prim.ListConstruct %661, %int2097152_1245 : (!torch.int, !torch.int) -> !torch.list<int>
%1562 = torch.aten.view %1560, %1561 : !torch.vtensor<[?,32,2,32,8,128],f16>, !torch.list<int> -> !torch.vtensor<[?,2097152],f16>
torch.bind_symbolic_shape %1562, [%549], affine_map<()[s0] -> (s0, 2097152)> : !torch.vtensor<[?,2097152],f16>
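    // V follows the same pattern: the cache is re-viewed into page layout, the slot indices are advanced by 1 to the
    // adjacent sub-block, and the quantized V (%1542) is scattered in before the cache is flattened back to [?, 2097152].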
%int32_1246 = torch.constant.int 32
%int2_1247 = torch.constant.int 2
%int32_1248 = torch.constant.int 32
%int8_1249 = torch.constant.int 8
%int128_1250 = torch.constant.int 128
%1563 = torch.prim.ListConstruct %661, %int32_1246, %int2_1247, %int32_1248, %int8_1249, %int128_1250 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1564 = torch.aten.view %1562, %1563 : !torch.vtensor<[?,2097152],f16>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f16>
torch.bind_symbolic_shape %1564, [%549], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f16>
%int32_1251 = torch.constant.int 32
%int8_1252 = torch.constant.int 8
%int128_1253 = torch.constant.int 128
%1565 = torch.prim.ListConstruct %1554, %int32_1251, %int8_1252, %int128_1253 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1566 = torch.aten.view %1564, %1565 : !torch.vtensor<[?,32,2,32,8,128],f16>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f16>
torch.bind_symbolic_shape %1566, [%549], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f16>
%int1_1254 = torch.constant.int 1
%int32_1255 = torch.constant.int 32
%int8_1256 = torch.constant.int 8
%int128_1257 = torch.constant.int 128
%1567 = torch.prim.ListConstruct %int1_1254, %670, %int32_1255, %int8_1256, %int128_1257 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1568 = torch.aten.view %1542, %1567 : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %1568, [%548], affine_map<()[s0] -> (1, s0, 32, 8, 128)> : !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>
%int32_1258 = torch.constant.int 32
%int8_1259 = torch.constant.int 8
%int128_1260 = torch.constant.int 128
%1569 = torch.prim.ListConstruct %670, %int32_1258, %int8_1259, %int128_1260 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1570 = torch.aten.view %1568, %1569 : !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %1570, [%548], affine_map<()[s0] -> (s0, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
%int1_1261 = torch.constant.int 1
%int1_1262 = torch.constant.int 1
%1571 = torch.aten.add.Scalar %1544, %int1_1261, %int1_1262 : !torch.vtensor<[1,?],si64>, !torch.int, !torch.int -> !torch.vtensor<[1,?],si64>
torch.bind_symbolic_shape %1571, [%548], affine_map<()[s0] -> (1, s0)> : !torch.vtensor<[1,?],si64>
%1572 = torch.prim.ListConstruct %670 : (!torch.int) -> !torch.list<int>
%1573 = torch.aten.view %1571, %1572 : !torch.vtensor<[1,?],si64>, !torch.list<int> -> !torch.vtensor<[?],si64>
torch.bind_symbolic_shape %1573, [%548], affine_map<()[s0] -> (s0)> : !torch.vtensor<[?],si64>
%1574 = torch.prim.ListConstruct %1573 : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>>
%false_1263 = torch.constant.bool false
%1575 = torch.aten.index_put %1566, %1574, %1570, %false_1263 : !torch.vtensor<[?,32,8,128],f16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.bool -> !torch.vtensor<[?,32,8,128],f16>
torch.bind_symbolic_shape %1575, [%549], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f16>
%int32_1264 = torch.constant.int 32
%int2_1265 = torch.constant.int 2
%int32_1266 = torch.constant.int 32
%int8_1267 = torch.constant.int 8
%int128_1268 = torch.constant.int 128
%1576 = torch.prim.ListConstruct %661, %int32_1264, %int2_1265, %int32_1266, %int8_1267, %int128_1268 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1577 = torch.aten.view %1575, %1576 : !torch.vtensor<[?,32,8,128],f16>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f16>
torch.bind_symbolic_shape %1577, [%549], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f16>
%int2097152_1269 = torch.constant.int 2097152
%1578 = torch.prim.ListConstruct %661, %int2097152_1269 : (!torch.int, !torch.int) -> !torch.list<int>
%1579 = torch.aten.view %1577, %1578 : !torch.vtensor<[?,32,2,32,8,128],f16>, !torch.list<int> -> !torch.vtensor<[?,2097152],f16>
torch.bind_symbolic_shape %1579, [%549], affine_map<()[s0] -> (s0, 2097152)> : !torch.vtensor<[?,2097152],f16>
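    // Grouped-query attention expansion: the 8 KV heads are unsqueezed, expanded by a factor of 4 to 32 heads, and
    // reshaped to [1, seq, 32, 128]; both K and V are then dequantized back to bf16 by multiplying with the same
    // scale (%76).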
%int-2_1270 = torch.constant.int -2
%1580 = torch.aten.unsqueeze %1539, %int-2_1270 : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %1580, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 1, 128)> : !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>
%int1_1271 = torch.constant.int 1
%int8_1272 = torch.constant.int 8
%int4_1273 = torch.constant.int 4
%int128_1274 = torch.constant.int 128
%1581 = torch.prim.ListConstruct %int1_1271, %1521, %int8_1272, %int4_1273, %int128_1274 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%false_1275 = torch.constant.bool false
%1582 = torch.aten.expand %1580, %1581, %false_1275 : !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %1582, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 4, 128)> : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
%int0_1276 = torch.constant.int 0
%1583 = torch.aten.clone %1582, %int0_1276 : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %1583, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 4, 128)> : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
%int1_1277 = torch.constant.int 1
%int32_1278 = torch.constant.int 32
%int128_1279 = torch.constant.int 128
%1584 = torch.prim.ListConstruct %int1_1277, %1521, %int32_1278, %int128_1279 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1585 = torch.aten._unsafe_view %1583, %1584 : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %1585, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>
%int-2_1280 = torch.constant.int -2
%1586 = torch.aten.unsqueeze %1542, %int-2_1280 : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %1586, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 1, 128)> : !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>
%int1_1281 = torch.constant.int 1
%1587 = torch.aten.size.int %1473, %int1_1281 : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>, !torch.int -> !torch.int
%int1_1282 = torch.constant.int 1
%int8_1283 = torch.constant.int 8
%int4_1284 = torch.constant.int 4
%int128_1285 = torch.constant.int 128
%1588 = torch.prim.ListConstruct %int1_1282, %1587, %int8_1283, %int4_1284, %int128_1285 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%false_1286 = torch.constant.bool false
%1589 = torch.aten.expand %1586, %1588, %false_1286 : !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %1589, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 4, 128)> : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
%int0_1287 = torch.constant.int 0
%1590 = torch.aten.clone %1589, %int0_1287 : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %1590, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 4, 128)> : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
%int1_1288 = torch.constant.int 1
%int32_1289 = torch.constant.int 32
%int128_1290 = torch.constant.int 128
%1591 = torch.prim.ListConstruct %int1_1288, %1587, %int32_1289, %int128_1290 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1592 = torch.aten._unsafe_view %1590, %1591 : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %1592, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>
%int6_1291 = torch.constant.int 6
%1593 = torch.prims.convert_element_type %1585, %int6_1291 : !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,32,128],f32>
torch.bind_symbolic_shape %1593, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32>
%1594 = torch.aten.mul.Tensor %1593, %76 : !torch.vtensor<[1,?,32,128],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,32,128],f32>
torch.bind_symbolic_shape %1594, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32>
%int15_1292 = torch.constant.int 15
%1595 = torch.prims.convert_element_type %1594, %int15_1292 : !torch.vtensor<[1,?,32,128],f32>, !torch.int -> !torch.vtensor<[1,?,32,128],bf16>
torch.bind_symbolic_shape %1595, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16>
%int6_1293 = torch.constant.int 6
%1596 = torch.prims.convert_element_type %1592, %int6_1293 : !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,32,128],f32>
torch.bind_symbolic_shape %1596, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32>
%1597 = torch.aten.mul.Tensor %1596, %76 : !torch.vtensor<[1,?,32,128],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,32,128],f32>
torch.bind_symbolic_shape %1597, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32>
%int15_1294 = torch.constant.int 15
%1598 = torch.prims.convert_element_type %1597, %int15_1294 : !torch.vtensor<[1,?,32,128],f32>, !torch.int -> !torch.vtensor<[1,?,32,128],bf16>
torch.bind_symbolic_shape %1598, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16>
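    // Q, K, and V are transposed to [1, 32, seq, 128] and run through the CPU flash-attention op with
    // dropout 0.0 and is_causal = true.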
%int1_1295 = torch.constant.int 1
%int2_1296 = torch.constant.int 2
%1599 = torch.aten.transpose.int %1508, %int1_1295, %int2_1296 : !torch.vtensor<[1,?,32,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,32,?,128],bf16>
torch.bind_symbolic_shape %1599, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16>
%int1_1297 = torch.constant.int 1
%int2_1298 = torch.constant.int 2
%1600 = torch.aten.transpose.int %1595, %int1_1297, %int2_1298 : !torch.vtensor<[1,?,32,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,32,?,128],bf16>
torch.bind_symbolic_shape %1600, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16>
%int1_1299 = torch.constant.int 1
%int2_1300 = torch.constant.int 2
%1601 = torch.aten.transpose.int %1598, %int1_1299, %int2_1300 : !torch.vtensor<[1,?,32,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,32,?,128],bf16>
torch.bind_symbolic_shape %1601, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16>
%float0.000000e00_1301 = torch.constant.float 0.000000e+00
%true_1302 = torch.constant.bool true
%none_1303 = torch.constant.none
%none_1304 = torch.constant.none
%1602:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%1599, %1600, %1601, %float0.000000e00_1301, %true_1302, %none_1303, %none_1304) : (!torch.vtensor<[1,32,?,128],bf16>, !torch.vtensor<[1,32,?,128],bf16>, !torch.vtensor<[1,32,?,128],bf16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[1,32,?,128],bf16>, !torch.vtensor<[1,32,?],f32>)
torch.bind_symbolic_shape %1602#0, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16>
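    // Attention output projection: transpose back, flatten heads to [1, seq, 4096], quantize (scale %77, clamp to
    // ±240), matmul with what is presumably the transposed attn_output weight (%78), dequantize to bf16, and add
    // onto the f32 residual stream (%1437).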
%int1_1305 = torch.constant.int 1
%int2_1306 = torch.constant.int 2
%1603 = torch.aten.transpose.int %1602#0, %int1_1305, %int2_1306 : !torch.vtensor<[1,32,?,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,?,32,128],bf16>
torch.bind_symbolic_shape %1603, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16>
%int1_1307 = torch.constant.int 1
%int4096_1308 = torch.constant.int 4096
%1604 = torch.prim.ListConstruct %int1_1307, %1493, %int4096_1308 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1605 = torch.aten.view %1603, %1604 : !torch.vtensor<[1,?,32,128],bf16>, !torch.list<int> -> !torch.vtensor<[1,?,4096],bf16>
torch.bind_symbolic_shape %1605, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16>
%1606 = torch.aten.div.Tensor %1605, %77 : !torch.vtensor<[1,?,4096],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],bf16>
torch.bind_symbolic_shape %1606, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16>
%float-2.400000e02_1309 = torch.constant.float -2.400000e+02
%float2.400000e02_1310 = torch.constant.float 2.400000e+02
%1607 = torch.aten.clamp %1606, %float-2.400000e02_1309, %float2.400000e02_1310 : !torch.vtensor<[1,?,4096],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],bf16>
torch.bind_symbolic_shape %1607, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16>
%int26_1311 = torch.constant.int 26
%1608 = torch.prims.convert_element_type %1607, %int26_1311 : !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1608, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int-2_1312 = torch.constant.int -2
%int-1_1313 = torch.constant.int -1
%1609 = torch.aten.transpose.int %78, %int-2_1312, %int-1_1313 : !torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%int4096_1314 = torch.constant.int 4096
%1610 = torch.prim.ListConstruct %1493, %int4096_1314 : (!torch.int, !torch.int) -> !torch.list<int>
%1611 = torch.aten.view %1608, %1610 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1611, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%1612 = torch.aten.mm %1611, %1609 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,4096],f8E4M3FNUZ> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1612, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%int1_1315 = torch.constant.int 1
%int4096_1316 = torch.constant.int 4096
%1613 = torch.prim.ListConstruct %int1_1315, %1493, %int4096_1316 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1614 = torch.aten.view %1612, %1613 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1614, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int15_1317 = torch.constant.int 15
%1615 = torch.prims.convert_element_type %1614, %int15_1317 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,4096],bf16>
torch.bind_symbolic_shape %1615, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16>
%int1_1318 = torch.constant.int 1
%1616 = torch.aten.add.Tensor %1437, %1615, %int1_1318 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1616, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
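    // FFN RMSNorm: mean of squares over the last dim, add eps 1e-05, rsqrt, scale the residual, and multiply by the
    // norm weight (%79).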
%int2_1319 = torch.constant.int 2
%1617 = torch.aten.pow.Tensor_Scalar %1616, %int2_1319 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1617, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int-1_1320 = torch.constant.int -1
%1618 = torch.prim.ListConstruct %int-1_1320 : (!torch.int) -> !torch.list<int>
%true_1321 = torch.constant.bool true
%none_1322 = torch.constant.none
%1619 = torch.aten.mean.dim %1617, %1618, %true_1321, %none_1322 : !torch.vtensor<[1,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,?,1],f32>
torch.bind_symbolic_shape %1619, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32>
%float1.000000e-05_1323 = torch.constant.float 1.000000e-05
%int1_1324 = torch.constant.int 1
%1620 = torch.aten.add.Scalar %1619, %float1.000000e-05_1323, %int1_1324 : !torch.vtensor<[1,?,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,?,1],f32>
torch.bind_symbolic_shape %1620, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32>
%1621 = torch.aten.rsqrt %1620 : !torch.vtensor<[1,?,1],f32> -> !torch.vtensor<[1,?,1],f32>
torch.bind_symbolic_shape %1621, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32>
%1622 = torch.aten.mul.Tensor %1616, %1621 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[1,?,1],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1622, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%1623 = torch.aten.mul.Tensor %79, %1622 : !torch.vtensor<[4096],bf16>, !torch.vtensor<[1,?,4096],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1623, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
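    // FFN gate and up projections: the normalized activations are quantized twice (scales %80 and %82, clamp ±240),
    // multiplied with the transposed [14336,4096] weights (%81 and %83) in f8E4M3FNUZ, dequantized to bf16, SiLU is
    // applied to the gate branch, and the two branches are multiplied elementwise.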
%1624 = torch.aten.div.Tensor %1623, %80 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1624, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%float-2.400000e02_1325 = torch.constant.float -2.400000e+02
%float2.400000e02_1326 = torch.constant.float 2.400000e+02
%1625 = torch.aten.clamp %1624, %float-2.400000e02_1325, %float2.400000e02_1326 : !torch.vtensor<[1,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1625, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int26_1327 = torch.constant.int 26
%1626 = torch.prims.convert_element_type %1625, %int26_1327 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1626, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int-2_1328 = torch.constant.int -2
%int-1_1329 = torch.constant.int -1
%1627 = torch.aten.transpose.int %81, %int-2_1328, %int-1_1329 : !torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
%int4096_1330 = torch.constant.int 4096
%1628 = torch.prim.ListConstruct %566, %int4096_1330 : (!torch.int, !torch.int) -> !torch.list<int>
%1629 = torch.aten.view %1626, %1628 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1629, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%1630 = torch.aten.mm %1629, %1627 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,14336],f8E4M3FNUZ> -> !torch.vtensor<[?,14336],f8E4M3FNUZ>
torch.bind_symbolic_shape %1630, [%548], affine_map<()[s0] -> (s0 * 32, 14336)> : !torch.vtensor<[?,14336],f8E4M3FNUZ>
%int1_1331 = torch.constant.int 1
%int14336_1332 = torch.constant.int 14336
%1631 = torch.prim.ListConstruct %int1_1331, %566, %int14336_1332 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1632 = torch.aten.view %1630, %1631 : !torch.vtensor<[?,14336],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,14336],f8E4M3FNUZ>
torch.bind_symbolic_shape %1632, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>
%int15_1333 = torch.constant.int 15
%1633 = torch.prims.convert_element_type %1632, %int15_1333 : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,14336],bf16>
torch.bind_symbolic_shape %1633, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16>
%1634 = torch.aten.silu %1633 : !torch.vtensor<[1,?,14336],bf16> -> !torch.vtensor<[1,?,14336],bf16>
torch.bind_symbolic_shape %1634, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16>
%1635 = torch.aten.div.Tensor %1623, %82 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1635, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%float-2.400000e02_1334 = torch.constant.float -2.400000e+02
%float2.400000e02_1335 = torch.constant.float 2.400000e+02
%1636 = torch.aten.clamp %1635, %float-2.400000e02_1334, %float2.400000e02_1335 : !torch.vtensor<[1,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1636, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int26_1336 = torch.constant.int 26
%1637 = torch.prims.convert_element_type %1636, %int26_1336 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1637, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int-2_1337 = torch.constant.int -2
%int-1_1338 = torch.constant.int -1
%1638 = torch.aten.transpose.int %83, %int-2_1337, %int-1_1338 : !torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
%int4096_1339 = torch.constant.int 4096
%1639 = torch.prim.ListConstruct %566, %int4096_1339 : (!torch.int, !torch.int) -> !torch.list<int>
%1640 = torch.aten.view %1637, %1639 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1640, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%1641 = torch.aten.mm %1640, %1638 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,14336],f8E4M3FNUZ> -> !torch.vtensor<[?,14336],f8E4M3FNUZ>
torch.bind_symbolic_shape %1641, [%548], affine_map<()[s0] -> (s0 * 32, 14336)> : !torch.vtensor<[?,14336],f8E4M3FNUZ>
%int1_1340 = torch.constant.int 1
%int14336_1341 = torch.constant.int 14336
%1642 = torch.prim.ListConstruct %int1_1340, %566, %int14336_1341 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1643 = torch.aten.view %1641, %1642 : !torch.vtensor<[?,14336],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,14336],f8E4M3FNUZ>
torch.bind_symbolic_shape %1643, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>
%int15_1342 = torch.constant.int 15
%1644 = torch.prims.convert_element_type %1643, %int15_1342 : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,14336],bf16>
torch.bind_symbolic_shape %1644, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16>
%1645 = torch.aten.mul.Tensor %1634, %1644 : !torch.vtensor<[1,?,14336],bf16>, !torch.vtensor<[1,?,14336],bf16> -> !torch.vtensor<[1,?,14336],bf16>
torch.bind_symbolic_shape %1645, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16>
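    // Down projection and residual: the gated product is quantized (scale %84, clamp ±240), multiplied with the
    // transposed [4096,14336] down weight (%85), dequantized to bf16, and added onto the residual stream (%1616).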
%1646 = torch.aten.div.Tensor %1645, %84 : !torch.vtensor<[1,?,14336],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,14336],bf16>
torch.bind_symbolic_shape %1646, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16>
%float-2.400000e02_1343 = torch.constant.float -2.400000e+02
%float2.400000e02_1344 = torch.constant.float 2.400000e+02
%1647 = torch.aten.clamp %1646, %float-2.400000e02_1343, %float2.400000e02_1344 : !torch.vtensor<[1,?,14336],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,14336],bf16>
torch.bind_symbolic_shape %1647, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16>
%int26_1345 = torch.constant.int 26
%1648 = torch.prims.convert_element_type %1647, %int26_1345 : !torch.vtensor<[1,?,14336],bf16>, !torch.int -> !torch.vtensor<[1,?,14336],f8E4M3FNUZ>
torch.bind_symbolic_shape %1648, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>
%int-2_1346 = torch.constant.int -2
%int-1_1347 = torch.constant.int -1
%1649 = torch.aten.transpose.int %85, %int-2_1346, %int-1_1347 : !torch.vtensor<[4096,14336],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%int1_1348 = torch.constant.int 1
%1650 = torch.aten.size.int %1632, %int1_1348 : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>, !torch.int -> !torch.int
%int14336_1349 = torch.constant.int 14336
%1651 = torch.prim.ListConstruct %1650, %int14336_1349 : (!torch.int, !torch.int) -> !torch.list<int>
%1652 = torch.aten.view %1648, %1651 : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,14336],f8E4M3FNUZ>
torch.bind_symbolic_shape %1652, [%548], affine_map<()[s0] -> (s0 * 32, 14336)> : !torch.vtensor<[?,14336],f8E4M3FNUZ>
%1653 = torch.aten.mm %1652, %1649 : !torch.vtensor<[?,14336],f8E4M3FNUZ>, !torch.vtensor<[14336,4096],f8E4M3FNUZ> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1653, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%int1_1350 = torch.constant.int 1
%int4096_1351 = torch.constant.int 4096
%1654 = torch.prim.ListConstruct %int1_1350, %1650, %int4096_1351 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1655 = torch.aten.view %1653, %1654 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1655, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int15_1352 = torch.constant.int 15
%1656 = torch.prims.convert_element_type %1655, %int15_1352 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,4096],bf16>
torch.bind_symbolic_shape %1656, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16>
%int1_1353 = torch.constant.int 1
%1657 = torch.aten.add.Tensor %1616, %1656, %int1_1353 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1657, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
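    // Next transformer block: attention RMSNorm with weight %86, then quantized Q/K/V projections (weights %88, %90,
    // %92 with input scales %87, %89, %91) producing [1, seq, 32, 128] Q and [1, seq, 8, 128] K/V in bf16.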
%int2_1354 = torch.constant.int 2
%1658 = torch.aten.pow.Tensor_Scalar %1657, %int2_1354 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1658, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int-1_1355 = torch.constant.int -1
%1659 = torch.prim.ListConstruct %int-1_1355 : (!torch.int) -> !torch.list<int>
%true_1356 = torch.constant.bool true
%none_1357 = torch.constant.none
%1660 = torch.aten.mean.dim %1658, %1659, %true_1356, %none_1357 : !torch.vtensor<[1,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,?,1],f32>
torch.bind_symbolic_shape %1660, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32>
%float1.000000e-05_1358 = torch.constant.float 1.000000e-05
%int1_1359 = torch.constant.int 1
%1661 = torch.aten.add.Scalar %1660, %float1.000000e-05_1358, %int1_1359 : !torch.vtensor<[1,?,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,?,1],f32>
torch.bind_symbolic_shape %1661, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32>
%1662 = torch.aten.rsqrt %1661 : !torch.vtensor<[1,?,1],f32> -> !torch.vtensor<[1,?,1],f32>
torch.bind_symbolic_shape %1662, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32>
%1663 = torch.aten.mul.Tensor %1657, %1662 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[1,?,1],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1663, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%1664 = torch.aten.mul.Tensor %86, %1663 : !torch.vtensor<[4096],bf16>, !torch.vtensor<[1,?,4096],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1664, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%1665 = torch.aten.div.Tensor %1664, %87 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1665, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%float-2.400000e02_1360 = torch.constant.float -2.400000e+02
%float2.400000e02_1361 = torch.constant.float 2.400000e+02
%1666 = torch.aten.clamp %1665, %float-2.400000e02_1360, %float2.400000e02_1361 : !torch.vtensor<[1,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1666, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int26_1362 = torch.constant.int 26
%1667 = torch.prims.convert_element_type %1666, %int26_1362 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1667, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int-2_1363 = torch.constant.int -2
%int-1_1364 = torch.constant.int -1
%1668 = torch.aten.transpose.int %88, %int-2_1363, %int-1_1364 : !torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%int4096_1365 = torch.constant.int 4096
%1669 = torch.prim.ListConstruct %566, %int4096_1365 : (!torch.int, !torch.int) -> !torch.list<int>
%1670 = torch.aten.view %1667, %1669 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1670, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%1671 = torch.aten.mm %1670, %1668 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,4096],f8E4M3FNUZ> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1671, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%int1_1366 = torch.constant.int 1
%int4096_1367 = torch.constant.int 4096
%1672 = torch.prim.ListConstruct %int1_1366, %566, %int4096_1367 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1673 = torch.aten.view %1671, %1672 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1673, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int15_1368 = torch.constant.int 15
%1674 = torch.prims.convert_element_type %1673, %int15_1368 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,4096],bf16>
torch.bind_symbolic_shape %1674, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16>
%1675 = torch.aten.div.Tensor %1664, %89 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1675, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%float-2.400000e02_1369 = torch.constant.float -2.400000e+02
%float2.400000e02_1370 = torch.constant.float 2.400000e+02
%1676 = torch.aten.clamp %1675, %float-2.400000e02_1369, %float2.400000e02_1370 : !torch.vtensor<[1,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1676, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int26_1371 = torch.constant.int 26
%1677 = torch.prims.convert_element_type %1676, %int26_1371 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1677, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int-2_1372 = torch.constant.int -2
%int-1_1373 = torch.constant.int -1
%1678 = torch.aten.transpose.int %90, %int-2_1372, %int-1_1373 : !torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,1024],f8E4M3FNUZ>
%int4096_1374 = torch.constant.int 4096
%1679 = torch.prim.ListConstruct %566, %int4096_1374 : (!torch.int, !torch.int) -> !torch.list<int>
%1680 = torch.aten.view %1677, %1679 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1680, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%1681 = torch.aten.mm %1680, %1678 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,1024],f8E4M3FNUZ> -> !torch.vtensor<[?,1024],f8E4M3FNUZ>
torch.bind_symbolic_shape %1681, [%548], affine_map<()[s0] -> (s0 * 32, 1024)> : !torch.vtensor<[?,1024],f8E4M3FNUZ>
%int1_1375 = torch.constant.int 1
%int1024_1376 = torch.constant.int 1024
%1682 = torch.prim.ListConstruct %int1_1375, %566, %int1024_1376 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1683 = torch.aten.view %1681, %1682 : !torch.vtensor<[?,1024],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,1024],f8E4M3FNUZ>
torch.bind_symbolic_shape %1683, [%548], affine_map<()[s0] -> (1, s0 * 32, 1024)> : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>
%int15_1377 = torch.constant.int 15
%1684 = torch.prims.convert_element_type %1683, %int15_1377 : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,1024],bf16>
torch.bind_symbolic_shape %1684, [%548], affine_map<()[s0] -> (1, s0 * 32, 1024)> : !torch.vtensor<[1,?,1024],bf16>
%1685 = torch.aten.div.Tensor %1664, %91 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1685, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%float-2.400000e02_1378 = torch.constant.float -2.400000e+02
%float2.400000e02_1379 = torch.constant.float 2.400000e+02
%1686 = torch.aten.clamp %1685, %float-2.400000e02_1378, %float2.400000e02_1379 : !torch.vtensor<[1,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1686, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int26_1380 = torch.constant.int 26
%1687 = torch.prims.convert_element_type %1686, %int26_1380 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1687, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int-2_1381 = torch.constant.int -2
%int-1_1382 = torch.constant.int -1
%1688 = torch.aten.transpose.int %92, %int-2_1381, %int-1_1382 : !torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,1024],f8E4M3FNUZ>
%int4096_1383 = torch.constant.int 4096
%1689 = torch.prim.ListConstruct %566, %int4096_1383 : (!torch.int, !torch.int) -> !torch.list<int>
%1690 = torch.aten.view %1687, %1689 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1690, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%1691 = torch.aten.mm %1690, %1688 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,1024],f8E4M3FNUZ> -> !torch.vtensor<[?,1024],f8E4M3FNUZ>
torch.bind_symbolic_shape %1691, [%548], affine_map<()[s0] -> (s0 * 32, 1024)> : !torch.vtensor<[?,1024],f8E4M3FNUZ>
%int1_1384 = torch.constant.int 1
%int1024_1385 = torch.constant.int 1024
%1692 = torch.prim.ListConstruct %int1_1384, %566, %int1024_1385 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1693 = torch.aten.view %1691, %1692 : !torch.vtensor<[?,1024],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,1024],f8E4M3FNUZ>
torch.bind_symbolic_shape %1693, [%548], affine_map<()[s0] -> (1, s0 * 32, 1024)> : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>
%int15_1386 = torch.constant.int 15
%1694 = torch.prims.convert_element_type %1693, %int15_1386 : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,1024],bf16>
torch.bind_symbolic_shape %1694, [%548], affine_map<()[s0] -> (1, s0 * 32, 1024)> : !torch.vtensor<[1,?,1024],bf16>
%int1_1387 = torch.constant.int 1
%int32_1388 = torch.constant.int 32
%int128_1389 = torch.constant.int 128
%1695 = torch.prim.ListConstruct %int1_1387, %566, %int32_1388, %int128_1389 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1696 = torch.aten.view %1674, %1695 : !torch.vtensor<[1,?,4096],bf16>, !torch.list<int> -> !torch.vtensor<[1,?,32,128],bf16>
torch.bind_symbolic_shape %1696, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16>
%int1_1390 = torch.constant.int 1
%int8_1391 = torch.constant.int 8
%int128_1392 = torch.constant.int 128
%1697 = torch.prim.ListConstruct %int1_1390, %566, %int8_1391, %int128_1392 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1698 = torch.aten.view %1684, %1697 : !torch.vtensor<[1,?,1024],bf16>, !torch.list<int> -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %1698, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
%int1_1393 = torch.constant.int 1
%int8_1394 = torch.constant.int 8
%int128_1395 = torch.constant.int 128
%1699 = torch.prim.ListConstruct %int1_1393, %566, %int8_1394, %int128_1395 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1700 = torch.aten.view %1694, %1699 : !torch.vtensor<[1,?,1024],bf16>, !torch.list<int> -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %1700, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
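    // The RoPE frequency/angle table is rebuilt once more (same constants: 131072 positions, 128 channels,
    // base 500000) for this block's query path.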
%int131072_1396 = torch.constant.int 131072
%none_1397 = torch.constant.none
%none_1398 = torch.constant.none
%cpu_1399 = torch.constant.device "cpu"
%false_1400 = torch.constant.bool false
%1701 = torch.aten.arange %int131072_1396, %none_1397, %none_1398, %cpu_1399, %false_1400 : !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[131072],si64>
%int0_1401 = torch.constant.int 0
%int128_1402 = torch.constant.int 128
%none_1403 = torch.constant.none
%none_1404 = torch.constant.none
%cpu_1405 = torch.constant.device "cpu"
%false_1406 = torch.constant.bool false
%1702 = torch.aten.arange.start %int0_1401, %int128_1402, %none_1403, %none_1404, %cpu_1405, %false_1406 : !torch.int, !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[128],si64>
%int2_1407 = torch.constant.int 2
%1703 = torch.aten.floor_divide.Scalar %1702, %int2_1407 : !torch.vtensor<[128],si64>, !torch.int -> !torch.vtensor<[128],si64>
%int6_1408 = torch.constant.int 6
%1704 = torch.prims.convert_element_type %1703, %int6_1408 : !torch.vtensor<[128],si64>, !torch.int -> !torch.vtensor<[128],f32>
%int128_1409 = torch.constant.int 128
%1705 = torch.aten.div.Scalar %1704, %int128_1409 : !torch.vtensor<[128],f32>, !torch.int -> !torch.vtensor<[128],f32>
%float2.000000e00_1410 = torch.constant.float 2.000000e+00
%1706 = torch.aten.mul.Scalar %1705, %float2.000000e00_1410 : !torch.vtensor<[128],f32>, !torch.float -> !torch.vtensor<[128],f32>
%float5.000000e05_1411 = torch.constant.float 5.000000e+05
%1707 = torch.aten.pow.Scalar %float5.000000e05_1411, %1706 : !torch.float, !torch.vtensor<[128],f32> -> !torch.vtensor<[128],f32>
%1708 = torch.aten.reciprocal %1707 : !torch.vtensor<[128],f32> -> !torch.vtensor<[128],f32>
%float1.000000e00_1412 = torch.constant.float 1.000000e+00
%1709 = torch.aten.mul.Scalar %1708, %float1.000000e00_1412 : !torch.vtensor<[128],f32>, !torch.float -> !torch.vtensor<[128],f32>
%int131072_1413 = torch.constant.int 131072
%int1_1414 = torch.constant.int 1
%1710 = torch.prim.ListConstruct %int131072_1413, %int1_1414 : (!torch.int, !torch.int) -> !torch.list<int>
%1711 = torch.aten.view %1701, %1710 : !torch.vtensor<[131072],si64>, !torch.list<int> -> !torch.vtensor<[131072,1],si64>
%1712 = torch.aten.mul.Tensor %1711, %1709 : !torch.vtensor<[131072,1],si64>, !torch.vtensor<[128],f32> -> !torch.vtensor<[131072,128],f32>
%int1_1415 = torch.constant.int 1
%1713 = torch.aten.size.int %1673, %int1_1415 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.int -> !torch.int
%int0_1416 = torch.constant.int 0
%1714 = torch.aten.add.int %int0_1416, %1713 : !torch.int, !torch.int -> !torch.int
%int0_1417 = torch.constant.int 0
%int0_1418 = torch.constant.int 0
%int1_1419 = torch.constant.int 1
%1715 = torch.aten.slice.Tensor %1712, %int0_1417, %int0_1418, %1714, %int1_1419 : !torch.vtensor<[131072,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],f32>
torch.bind_symbolic_shape %1715, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],f32>
%int1_1420 = torch.constant.int 1
%int0_1421 = torch.constant.int 0
%int9223372036854775807_1422 = torch.constant.int 9223372036854775807
%int1_1423 = torch.constant.int 1
%1716 = torch.aten.slice.Tensor %1715, %int1_1420, %int0_1421, %int9223372036854775807_1422, %int1_1423 : !torch.vtensor<[?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],f32>
torch.bind_symbolic_shape %1716, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],f32>
%int1_1424 = torch.constant.int 1
%int0_1425 = torch.constant.int 0
%int9223372036854775807_1426 = torch.constant.int 9223372036854775807
%int1_1427 = torch.constant.int 1
%1717 = torch.aten.slice.Tensor %1716, %int1_1424, %int0_1425, %int9223372036854775807_1426, %int1_1427 : !torch.vtensor<[?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],f32>
torch.bind_symbolic_shape %1717, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],f32>
%int0_1428 = torch.constant.int 0
%1718 = torch.aten.unsqueeze %1717, %int0_1428 : !torch.vtensor<[?,128],f32>, !torch.int -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %1718, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int1_1429 = torch.constant.int 1
%int0_1430 = torch.constant.int 0
%int9223372036854775807_1431 = torch.constant.int 9223372036854775807
%int1_1432 = torch.constant.int 1
%1719 = torch.aten.slice.Tensor %1718, %int1_1429, %int0_1430, %int9223372036854775807_1431, %int1_1432 : !torch.vtensor<[1,?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %1719, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int2_1433 = torch.constant.int 2
%int0_1434 = torch.constant.int 0
%int9223372036854775807_1435 = torch.constant.int 9223372036854775807
%int1_1436 = torch.constant.int 1
%1720 = torch.aten.slice.Tensor %1719, %int2_1433, %int0_1434, %int9223372036854775807_1435, %int1_1436 : !torch.vtensor<[1,?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %1720, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int1_1437 = torch.constant.int 1
%int1_1438 = torch.constant.int 1
%int1_1439 = torch.constant.int 1
%1721 = torch.prim.ListConstruct %int1_1437, %int1_1438, %int1_1439 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1722 = torch.aten.repeat %1720, %1721 : !torch.vtensor<[1,?,128],f32>, !torch.list<int> -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %1722, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
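    // Apply rotary position embedding to the 32 query heads: upcast to f32, call
    // sharktank_rotary_embedding_1_D_32_128_f32, and cast back to bf16 (%1728).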
%int6_1440 = torch.constant.int 6
%1723 = torch.prims.convert_element_type %1696, %int6_1440 : !torch.vtensor<[1,?,32,128],bf16>, !torch.int -> !torch.vtensor<[1,?,32,128],f32>
torch.bind_symbolic_shape %1723, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32>
%1724 = torch_c.to_builtin_tensor %1723 : !torch.vtensor<[1,?,32,128],f32> -> tensor<1x?x32x128xf32>
%1725 = torch_c.to_builtin_tensor %1722 : !torch.vtensor<[1,?,128],f32> -> tensor<1x?x128xf32>
%1726 = util.call @sharktank_rotary_embedding_1_D_32_128_f32(%1724, %1725) : (tensor<1x?x32x128xf32>, tensor<1x?x128xf32>) -> tensor<1x?x32x128xf32>
%1727 = torch_c.from_builtin_tensor %1726 : tensor<1x?x32x128xf32> -> !torch.vtensor<[1,?,32,128],f32>
torch.bind_symbolic_shape %1727, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32>
%int15_1441 = torch.constant.int 15
%1728 = torch.prims.convert_element_type %1727, %int15_1441 : !torch.vtensor<[1,?,32,128],f32>, !torch.int -> !torch.vtensor<[1,?,32,128],bf16>
torch.bind_symbolic_shape %1728, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16>
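    // The same angle table is rebuilt and sliced for the 8 key heads, then the 8-head rotary kernel
    // sharktank_rotary_embedding_1_D_8_128_f32 is applied (%1756).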
%int131072_1442 = torch.constant.int 131072
%none_1443 = torch.constant.none
%none_1444 = torch.constant.none
%cpu_1445 = torch.constant.device "cpu"
%false_1446 = torch.constant.bool false
%1729 = torch.aten.arange %int131072_1442, %none_1443, %none_1444, %cpu_1445, %false_1446 : !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[131072],si64>
%int0_1447 = torch.constant.int 0
%int128_1448 = torch.constant.int 128
%none_1449 = torch.constant.none
%none_1450 = torch.constant.none
%cpu_1451 = torch.constant.device "cpu"
%false_1452 = torch.constant.bool false
%1730 = torch.aten.arange.start %int0_1447, %int128_1448, %none_1449, %none_1450, %cpu_1451, %false_1452 : !torch.int, !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[128],si64>
%int2_1453 = torch.constant.int 2
%1731 = torch.aten.floor_divide.Scalar %1730, %int2_1453 : !torch.vtensor<[128],si64>, !torch.int -> !torch.vtensor<[128],si64>
%int6_1454 = torch.constant.int 6
%1732 = torch.prims.convert_element_type %1731, %int6_1454 : !torch.vtensor<[128],si64>, !torch.int -> !torch.vtensor<[128],f32>
%int128_1455 = torch.constant.int 128
%1733 = torch.aten.div.Scalar %1732, %int128_1455 : !torch.vtensor<[128],f32>, !torch.int -> !torch.vtensor<[128],f32>
%float2.000000e00_1456 = torch.constant.float 2.000000e+00
%1734 = torch.aten.mul.Scalar %1733, %float2.000000e00_1456 : !torch.vtensor<[128],f32>, !torch.float -> !torch.vtensor<[128],f32>
%float5.000000e05_1457 = torch.constant.float 5.000000e+05
%1735 = torch.aten.pow.Scalar %float5.000000e05_1457, %1734 : !torch.float, !torch.vtensor<[128],f32> -> !torch.vtensor<[128],f32>
%1736 = torch.aten.reciprocal %1735 : !torch.vtensor<[128],f32> -> !torch.vtensor<[128],f32>
%float1.000000e00_1458 = torch.constant.float 1.000000e+00
%1737 = torch.aten.mul.Scalar %1736, %float1.000000e00_1458 : !torch.vtensor<[128],f32>, !torch.float -> !torch.vtensor<[128],f32>
%int131072_1459 = torch.constant.int 131072
%int1_1460 = torch.constant.int 1
%1738 = torch.prim.ListConstruct %int131072_1459, %int1_1460 : (!torch.int, !torch.int) -> !torch.list<int>
%1739 = torch.aten.view %1729, %1738 : !torch.vtensor<[131072],si64>, !torch.list<int> -> !torch.vtensor<[131072,1],si64>
%1740 = torch.aten.mul.Tensor %1739, %1737 : !torch.vtensor<[131072,1],si64>, !torch.vtensor<[128],f32> -> !torch.vtensor<[131072,128],f32>
%int1_1461 = torch.constant.int 1
%1741 = torch.aten.size.int %1683, %int1_1461 : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>, !torch.int -> !torch.int
%int0_1462 = torch.constant.int 0
%1742 = torch.aten.add.int %int0_1462, %1741 : !torch.int, !torch.int -> !torch.int
%int0_1463 = torch.constant.int 0
%int0_1464 = torch.constant.int 0
%int1_1465 = torch.constant.int 1
%1743 = torch.aten.slice.Tensor %1740, %int0_1463, %int0_1464, %1742, %int1_1465 : !torch.vtensor<[131072,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],f32>
torch.bind_symbolic_shape %1743, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],f32>
%int1_1466 = torch.constant.int 1
%int0_1467 = torch.constant.int 0
%int9223372036854775807_1468 = torch.constant.int 9223372036854775807
%int1_1469 = torch.constant.int 1
%1744 = torch.aten.slice.Tensor %1743, %int1_1466, %int0_1467, %int9223372036854775807_1468, %int1_1469 : !torch.vtensor<[?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],f32>
torch.bind_symbolic_shape %1744, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],f32>
%int1_1470 = torch.constant.int 1
%int0_1471 = torch.constant.int 0
%int9223372036854775807_1472 = torch.constant.int 9223372036854775807
%int1_1473 = torch.constant.int 1
%1745 = torch.aten.slice.Tensor %1744, %int1_1470, %int0_1471, %int9223372036854775807_1472, %int1_1473 : !torch.vtensor<[?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],f32>
torch.bind_symbolic_shape %1745, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],f32>
%int0_1474 = torch.constant.int 0
%1746 = torch.aten.unsqueeze %1745, %int0_1474 : !torch.vtensor<[?,128],f32>, !torch.int -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %1746, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int1_1475 = torch.constant.int 1
%int0_1476 = torch.constant.int 0
%int9223372036854775807_1477 = torch.constant.int 9223372036854775807
%int1_1478 = torch.constant.int 1
%1747 = torch.aten.slice.Tensor %1746, %int1_1475, %int0_1476, %int9223372036854775807_1477, %int1_1478 : !torch.vtensor<[1,?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %1747, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int2_1479 = torch.constant.int 2
%int0_1480 = torch.constant.int 0
%int9223372036854775807_1481 = torch.constant.int 9223372036854775807
%int1_1482 = torch.constant.int 1
%1748 = torch.aten.slice.Tensor %1747, %int2_1479, %int0_1480, %int9223372036854775807_1481, %int1_1482 : !torch.vtensor<[1,?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %1748, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int1_1483 = torch.constant.int 1
%int1_1484 = torch.constant.int 1
%int1_1485 = torch.constant.int 1
%1749 = torch.prim.ListConstruct %int1_1483, %int1_1484, %int1_1485 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1750 = torch.aten.repeat %1748, %1749 : !torch.vtensor<[1,?,128],f32>, !torch.list<int> -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %1750, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int6_1486 = torch.constant.int 6
%1751 = torch.prims.convert_element_type %1698, %int6_1486 : !torch.vtensor<[1,?,8,128],bf16>, !torch.int -> !torch.vtensor<[1,?,8,128],f32>
torch.bind_symbolic_shape %1751, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],f32>
%1752 = torch_c.to_builtin_tensor %1751 : !torch.vtensor<[1,?,8,128],f32> -> tensor<1x?x8x128xf32>
%1753 = torch_c.to_builtin_tensor %1750 : !torch.vtensor<[1,?,128],f32> -> tensor<1x?x128xf32>
%1754 = util.call @sharktank_rotary_embedding_1_D_8_128_f32(%1752, %1753) : (tensor<1x?x8x128xf32>, tensor<1x?x128xf32>) -> tensor<1x?x8x128xf32>
%1755 = torch_c.from_builtin_tensor %1754 : tensor<1x?x8x128xf32> -> !torch.vtensor<[1,?,8,128],f32>
torch.bind_symbolic_shape %1755, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],f32>
%int15_1487 = torch.constant.int 15
%1756 = torch.prims.convert_element_type %1755, %int15_1487 : !torch.vtensor<[1,?,8,128],f32>, !torch.int -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %1756, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
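    // Quantize the rotated keys and the values for the KV cache: divide by scale %93,
    // clamp to the f8E4M3FNUZ range [-240, 240], and cast to f8E4M3FNUZ (%1759, %1762).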
%1757 = torch.aten.div.Tensor %1756, %93 : !torch.vtensor<[1,?,8,128],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %1757, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
%float-2.400000e02_1488 = torch.constant.float -2.400000e+02
%float2.400000e02_1489 = torch.constant.float 2.400000e+02
%1758 = torch.aten.clamp %1757, %float-2.400000e02_1488, %float2.400000e02_1489 : !torch.vtensor<[1,?,8,128],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %1758, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
%int26_1490 = torch.constant.int 26
%1759 = torch.prims.convert_element_type %1758, %int26_1490 : !torch.vtensor<[1,?,8,128],bf16>, !torch.int -> !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %1759, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>
%1760 = torch.aten.div.Tensor %1700, %93 : !torch.vtensor<[1,?,8,128],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %1760, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
%float-2.400000e02_1491 = torch.constant.float -2.400000e+02
%float2.400000e02_1492 = torch.constant.float 2.400000e+02
%1761 = torch.aten.clamp %1760, %float-2.400000e02_1491, %float2.400000e02_1492 : !torch.vtensor<[1,?,8,128],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %1761, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
%int26_1493 = torch.constant.int 26
%1762 = torch.prims.convert_element_type %1761, %int26_1493 : !torch.vtensor<[1,?,8,128],bf16>, !torch.int -> !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %1762, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>
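    // Compute flat cache-slot indices from the page table %arg2: page_id * 64 + 10
    // (64 slots per page; the constant offset apparently selects this block's key entries).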
%int64_1494 = torch.constant.int 64
%1763 = torch.aten.mul.Scalar %arg2, %int64_1494 : !torch.vtensor<[1,?],si64>, !torch.int -> !torch.vtensor<[1,?],si64>
torch.bind_symbolic_shape %1763, [%548], affine_map<()[s0] -> (1, s0)> : !torch.vtensor<[1,?],si64>
%int10 = torch.constant.int 10
%int1_1495 = torch.constant.int 1
%1764 = torch.aten.add.Scalar %1763, %int10, %int1_1495 : !torch.vtensor<[1,?],si64>, !torch.int, !torch.int -> !torch.vtensor<[1,?],si64>
torch.bind_symbolic_shape %1764, [%548], affine_map<()[s0] -> (1, s0)> : !torch.vtensor<[1,?],si64>
%int1_1496 = torch.constant.int 1
%int32_1497 = torch.constant.int 32
%int8_1498 = torch.constant.int 8
%int128_1499 = torch.constant.int 128
%1765 = torch.prim.ListConstruct %int1_1496, %670, %int32_1497, %int8_1498, %int128_1499 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1766 = torch.aten.view %1759, %1765 : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %1766, [%548], affine_map<()[s0] -> (1, s0, 32, 8, 128)> : !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>
%int32_1500 = torch.constant.int 32
%int8_1501 = torch.constant.int 8
%int128_1502 = torch.constant.int 128
%1767 = torch.prim.ListConstruct %670, %int32_1500, %int8_1501, %int128_1502 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1768 = torch.aten.view %1766, %1767 : !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %1768, [%548], affine_map<()[s0] -> (s0, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
%1769 = torch.prim.ListConstruct %670 : (!torch.int) -> !torch.list<int>
%1770 = torch.aten.view %1764, %1769 : !torch.vtensor<[1,?],si64>, !torch.list<int> -> !torch.vtensor<[?],si64>
torch.bind_symbolic_shape %1770, [%548], affine_map<()[s0] -> (s0)> : !torch.vtensor<[?],si64>
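    // Scatter the quantized keys into the paged KV cache: view %1579 as [pages, 32, 2, 32, 8, 128],
    // flatten to [pages*64, 32, 8, 128], and index_put at the slot indices %1770.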
%int32_1503 = torch.constant.int 32
%int2_1504 = torch.constant.int 2
%int32_1505 = torch.constant.int 32
%int8_1506 = torch.constant.int 8
%int128_1507 = torch.constant.int 128
%1771 = torch.prim.ListConstruct %661, %int32_1503, %int2_1504, %int32_1505, %int8_1506, %int128_1507 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1772 = torch.aten.view %1579, %1771 : !torch.vtensor<[?,2097152],f16>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f16>
torch.bind_symbolic_shape %1772, [%549], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f16>
%int32_1508 = torch.constant.int 32
%1773 = torch.aten.mul.int %661, %int32_1508 : !torch.int, !torch.int -> !torch.int
%int2_1509 = torch.constant.int 2
%1774 = torch.aten.mul.int %1773, %int2_1509 : !torch.int, !torch.int -> !torch.int
%int32_1510 = torch.constant.int 32
%int8_1511 = torch.constant.int 8
%int128_1512 = torch.constant.int 128
%1775 = torch.prim.ListConstruct %1774, %int32_1510, %int8_1511, %int128_1512 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1776 = torch.aten.view %1772, %1775 : !torch.vtensor<[?,32,2,32,8,128],f16>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f16>
torch.bind_symbolic_shape %1776, [%549], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f16>
%1777 = torch.prim.ListConstruct %1770 : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>>
%false_1513 = torch.constant.bool false
%1778 = torch.aten.index_put %1776, %1777, %1768, %false_1513 : !torch.vtensor<[?,32,8,128],f16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.bool -> !torch.vtensor<[?,32,8,128],f16>
torch.bind_symbolic_shape %1778, [%549], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f16>
%int32_1514 = torch.constant.int 32
%int2_1515 = torch.constant.int 2
%int32_1516 = torch.constant.int 32
%int8_1517 = torch.constant.int 8
%int128_1518 = torch.constant.int 128
%1779 = torch.prim.ListConstruct %661, %int32_1514, %int2_1515, %int32_1516, %int8_1517, %int128_1518 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1780 = torch.aten.view %1778, %1779 : !torch.vtensor<[?,32,8,128],f16>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f16>
torch.bind_symbolic_shape %1780, [%549], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f16>
%int2097152_1519 = torch.constant.int 2097152
%1781 = torch.prim.ListConstruct %661, %int2097152_1519 : (!torch.int, !torch.int) -> !torch.list<int>
%1782 = torch.aten.view %1780, %1781 : !torch.vtensor<[?,32,2,32,8,128],f16>, !torch.list<int> -> !torch.vtensor<[?,2097152],f16>
torch.bind_symbolic_shape %1782, [%549], affine_map<()[s0] -> (s0, 2097152)> : !torch.vtensor<[?,2097152],f16>
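    // Values are written the same way at the adjacent slot (indices + 1): re-view the updated cache,
    // index_put the quantized values (%1795), and flatten back to [pages, 2097152] (%1799).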
%int32_1520 = torch.constant.int 32
%int2_1521 = torch.constant.int 2
%int32_1522 = torch.constant.int 32
%int8_1523 = torch.constant.int 8
%int128_1524 = torch.constant.int 128
%1783 = torch.prim.ListConstruct %661, %int32_1520, %int2_1521, %int32_1522, %int8_1523, %int128_1524 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1784 = torch.aten.view %1782, %1783 : !torch.vtensor<[?,2097152],f16>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f16>
torch.bind_symbolic_shape %1784, [%549], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f16>
%int32_1525 = torch.constant.int 32
%int8_1526 = torch.constant.int 8
%int128_1527 = torch.constant.int 128
%1785 = torch.prim.ListConstruct %1774, %int32_1525, %int8_1526, %int128_1527 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1786 = torch.aten.view %1784, %1785 : !torch.vtensor<[?,32,2,32,8,128],f16>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f16>
torch.bind_symbolic_shape %1786, [%549], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f16>
%int1_1528 = torch.constant.int 1
%int32_1529 = torch.constant.int 32
%int8_1530 = torch.constant.int 8
%int128_1531 = torch.constant.int 128
%1787 = torch.prim.ListConstruct %int1_1528, %670, %int32_1529, %int8_1530, %int128_1531 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1788 = torch.aten.view %1762, %1787 : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %1788, [%548], affine_map<()[s0] -> (1, s0, 32, 8, 128)> : !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>
%int32_1532 = torch.constant.int 32
%int8_1533 = torch.constant.int 8
%int128_1534 = torch.constant.int 128
%1789 = torch.prim.ListConstruct %670, %int32_1532, %int8_1533, %int128_1534 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1790 = torch.aten.view %1788, %1789 : !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %1790, [%548], affine_map<()[s0] -> (s0, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
%int1_1535 = torch.constant.int 1
%int1_1536 = torch.constant.int 1
%1791 = torch.aten.add.Scalar %1764, %int1_1535, %int1_1536 : !torch.vtensor<[1,?],si64>, !torch.int, !torch.int -> !torch.vtensor<[1,?],si64>
torch.bind_symbolic_shape %1791, [%548], affine_map<()[s0] -> (1, s0)> : !torch.vtensor<[1,?],si64>
%1792 = torch.prim.ListConstruct %670 : (!torch.int) -> !torch.list<int>
%1793 = torch.aten.view %1791, %1792 : !torch.vtensor<[1,?],si64>, !torch.list<int> -> !torch.vtensor<[?],si64>
torch.bind_symbolic_shape %1793, [%548], affine_map<()[s0] -> (s0)> : !torch.vtensor<[?],si64>
%1794 = torch.prim.ListConstruct %1793 : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>>
%false_1537 = torch.constant.bool false
%1795 = torch.aten.index_put %1786, %1794, %1790, %false_1537 : !torch.vtensor<[?,32,8,128],f16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.bool -> !torch.vtensor<[?,32,8,128],f16>
torch.bind_symbolic_shape %1795, [%549], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f16>
%int32_1538 = torch.constant.int 32
%int2_1539 = torch.constant.int 2
%int32_1540 = torch.constant.int 32
%int8_1541 = torch.constant.int 8
%int128_1542 = torch.constant.int 128
%1796 = torch.prim.ListConstruct %661, %int32_1538, %int2_1539, %int32_1540, %int8_1541, %int128_1542 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1797 = torch.aten.view %1795, %1796 : !torch.vtensor<[?,32,8,128],f16>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f16>
torch.bind_symbolic_shape %1797, [%549], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f16>
%int2097152_1543 = torch.constant.int 2097152
%1798 = torch.prim.ListConstruct %661, %int2097152_1543 : (!torch.int, !torch.int) -> !torch.list<int>
%1799 = torch.aten.view %1797, %1798 : !torch.vtensor<[?,32,2,32,8,128],f16>, !torch.list<int> -> !torch.vtensor<[?,2097152],f16>
torch.bind_symbolic_shape %1799, [%549], affine_map<()[s0] -> (s0, 2097152)> : !torch.vtensor<[?,2097152],f16>
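    // Grouped-query attention: broadcast the 8 KV heads to the 32 query heads by unsqueezing to
    // [1, seq, 8, 1, 128], expanding to [1, seq, 8, 4, 128], and flattening to [1, seq, 32, 128].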
%int-2_1544 = torch.constant.int -2
%1800 = torch.aten.unsqueeze %1759, %int-2_1544 : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %1800, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 1, 128)> : !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>
%int1_1545 = torch.constant.int 1
%int8_1546 = torch.constant.int 8
%int4_1547 = torch.constant.int 4
%int128_1548 = torch.constant.int 128
%1801 = torch.prim.ListConstruct %int1_1545, %1741, %int8_1546, %int4_1547, %int128_1548 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%false_1549 = torch.constant.bool false
%1802 = torch.aten.expand %1800, %1801, %false_1549 : !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %1802, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 4, 128)> : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
%int0_1550 = torch.constant.int 0
%1803 = torch.aten.clone %1802, %int0_1550 : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %1803, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 4, 128)> : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
%int1_1551 = torch.constant.int 1
%int32_1552 = torch.constant.int 32
%int128_1553 = torch.constant.int 128
%1804 = torch.prim.ListConstruct %int1_1551, %1741, %int32_1552, %int128_1553 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1805 = torch.aten._unsafe_view %1803, %1804 : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %1805, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>
%int-2_1554 = torch.constant.int -2
%1806 = torch.aten.unsqueeze %1762, %int-2_1554 : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %1806, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 1, 128)> : !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>
%int1_1555 = torch.constant.int 1
%1807 = torch.aten.size.int %1693, %int1_1555 : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>, !torch.int -> !torch.int
%int1_1556 = torch.constant.int 1
%int8_1557 = torch.constant.int 8
%int4_1558 = torch.constant.int 4
%int128_1559 = torch.constant.int 128
%1808 = torch.prim.ListConstruct %int1_1556, %1807, %int8_1557, %int4_1558, %int128_1559 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%false_1560 = torch.constant.bool false
%1809 = torch.aten.expand %1806, %1808, %false_1560 : !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %1809, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 4, 128)> : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
%int0_1561 = torch.constant.int 0
%1810 = torch.aten.clone %1809, %int0_1561 : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %1810, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 4, 128)> : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
%int1_1562 = torch.constant.int 1
%int32_1563 = torch.constant.int 32
%int128_1564 = torch.constant.int 128
%1811 = torch.prim.ListConstruct %int1_1562, %1807, %int32_1563, %int128_1564 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1812 = torch.aten._unsafe_view %1810, %1811 : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %1812, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>
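    // Dequantize the expanded keys and values: cast f8E4M3FNUZ to f32, multiply by scale %93,
    // and cast to bf16 for attention (%1815, %1818).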
%int6_1565 = torch.constant.int 6
%1813 = torch.prims.convert_element_type %1805, %int6_1565 : !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,32,128],f32>
torch.bind_symbolic_shape %1813, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32>
%1814 = torch.aten.mul.Tensor %1813, %93 : !torch.vtensor<[1,?,32,128],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,32,128],f32>
torch.bind_symbolic_shape %1814, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32>
%int15_1566 = torch.constant.int 15
%1815 = torch.prims.convert_element_type %1814, %int15_1566 : !torch.vtensor<[1,?,32,128],f32>, !torch.int -> !torch.vtensor<[1,?,32,128],bf16>
torch.bind_symbolic_shape %1815, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16>
%int6_1567 = torch.constant.int 6
%1816 = torch.prims.convert_element_type %1812, %int6_1567 : !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,32,128],f32>
torch.bind_symbolic_shape %1816, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32>
%1817 = torch.aten.mul.Tensor %1816, %93 : !torch.vtensor<[1,?,32,128],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,32,128],f32>
torch.bind_symbolic_shape %1817, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32>
%int15_1568 = torch.constant.int 15
%1818 = torch.prims.convert_element_type %1817, %int15_1568 : !torch.vtensor<[1,?,32,128],f32>, !torch.int -> !torch.vtensor<[1,?,32,128],bf16>
torch.bind_symbolic_shape %1818, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16>
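    // Transpose Q/K/V to [1, 32, seq, 128] and run scaled-dot-product flash attention
    // (dropout 0.0, is_causal = true).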
%int1_1569 = torch.constant.int 1
%int2_1570 = torch.constant.int 2
%1819 = torch.aten.transpose.int %1728, %int1_1569, %int2_1570 : !torch.vtensor<[1,?,32,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,32,?,128],bf16>
torch.bind_symbolic_shape %1819, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16>
%int1_1571 = torch.constant.int 1
%int2_1572 = torch.constant.int 2
%1820 = torch.aten.transpose.int %1815, %int1_1571, %int2_1572 : !torch.vtensor<[1,?,32,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,32,?,128],bf16>
torch.bind_symbolic_shape %1820, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16>
%int1_1573 = torch.constant.int 1
%int2_1574 = torch.constant.int 2
%1821 = torch.aten.transpose.int %1818, %int1_1573, %int2_1574 : !torch.vtensor<[1,?,32,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,32,?,128],bf16>
torch.bind_symbolic_shape %1821, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16>
%float0.000000e00_1575 = torch.constant.float 0.000000e+00
%true_1576 = torch.constant.bool true
%none_1577 = torch.constant.none
%none_1578 = torch.constant.none
%1822:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%1819, %1820, %1821, %float0.000000e00_1575, %true_1576, %none_1577, %none_1578) : (!torch.vtensor<[1,32,?,128],bf16>, !torch.vtensor<[1,32,?,128],bf16>, !torch.vtensor<[1,32,?,128],bf16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[1,32,?,128],bf16>, !torch.vtensor<[1,32,?],f32>)
torch.bind_symbolic_shape %1822#0, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16>
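    // Merge the attention heads back to [1, seq, 4096] and quantize for the output projection
    // (divide by %94, clamp to [-240, 240], cast to f8E4M3FNUZ).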
%int1_1579 = torch.constant.int 1
%int2_1580 = torch.constant.int 2
%1823 = torch.aten.transpose.int %1822#0, %int1_1579, %int2_1580 : !torch.vtensor<[1,32,?,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,?,32,128],bf16>
torch.bind_symbolic_shape %1823, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16>
%int1_1581 = torch.constant.int 1
%int4096_1582 = torch.constant.int 4096
%1824 = torch.prim.ListConstruct %int1_1581, %1713, %int4096_1582 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1825 = torch.aten.view %1823, %1824 : !torch.vtensor<[1,?,32,128],bf16>, !torch.list<int> -> !torch.vtensor<[1,?,4096],bf16>
torch.bind_symbolic_shape %1825, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16>
%1826 = torch.aten.div.Tensor %1825, %94 : !torch.vtensor<[1,?,4096],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],bf16>
torch.bind_symbolic_shape %1826, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16>
%float-2.400000e02_1583 = torch.constant.float -2.400000e+02
%float2.400000e02_1584 = torch.constant.float 2.400000e+02
%1827 = torch.aten.clamp %1826, %float-2.400000e02_1583, %float2.400000e02_1584 : !torch.vtensor<[1,?,4096],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],bf16>
torch.bind_symbolic_shape %1827, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16>
%int26_1585 = torch.constant.int 26
%1828 = torch.prims.convert_element_type %1827, %int26_1585 : !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1828, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
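    // Attention output projection: f8 matmul with the transposed [4096, 4096] weight %95,
    // dequantize to bf16, and add the residual %1657 (%1836).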
%int-2_1586 = torch.constant.int -2
%int-1_1587 = torch.constant.int -1
%1829 = torch.aten.transpose.int %95, %int-2_1586, %int-1_1587 : !torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%int4096_1588 = torch.constant.int 4096
%1830 = torch.prim.ListConstruct %1713, %int4096_1588 : (!torch.int, !torch.int) -> !torch.list<int>
%1831 = torch.aten.view %1828, %1830 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1831, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%1832 = torch.aten.mm %1831, %1829 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,4096],f8E4M3FNUZ> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1832, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%int1_1589 = torch.constant.int 1
%int4096_1590 = torch.constant.int 4096
%1833 = torch.prim.ListConstruct %int1_1589, %1713, %int4096_1590 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1834 = torch.aten.view %1832, %1833 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1834, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int15_1591 = torch.constant.int 15
%1835 = torch.prims.convert_element_type %1834, %int15_1591 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,4096],bf16>
torch.bind_symbolic_shape %1835, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16>
%int1_1592 = torch.constant.int 1
%1836 = torch.aten.add.Tensor %1657, %1835, %int1_1592 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1836, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
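    // RMSNorm before the FFN: mean of squares over the last dim, add eps 1e-5, rsqrt, scale by the
    // [4096] weight %96; then quantize the normalized activations (divide by %97, clamp, cast to f8E4M3FNUZ).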
%int2_1593 = torch.constant.int 2
%1837 = torch.aten.pow.Tensor_Scalar %1836, %int2_1593 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1837, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int-1_1594 = torch.constant.int -1
%1838 = torch.prim.ListConstruct %int-1_1594 : (!torch.int) -> !torch.list<int>
%true_1595 = torch.constant.bool true
%none_1596 = torch.constant.none
%1839 = torch.aten.mean.dim %1837, %1838, %true_1595, %none_1596 : !torch.vtensor<[1,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,?,1],f32>
torch.bind_symbolic_shape %1839, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32>
%float1.000000e-05_1597 = torch.constant.float 1.000000e-05
%int1_1598 = torch.constant.int 1
%1840 = torch.aten.add.Scalar %1839, %float1.000000e-05_1597, %int1_1598 : !torch.vtensor<[1,?,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,?,1],f32>
torch.bind_symbolic_shape %1840, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32>
%1841 = torch.aten.rsqrt %1840 : !torch.vtensor<[1,?,1],f32> -> !torch.vtensor<[1,?,1],f32>
torch.bind_symbolic_shape %1841, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32>
%1842 = torch.aten.mul.Tensor %1836, %1841 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[1,?,1],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1842, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%1843 = torch.aten.mul.Tensor %96, %1842 : !torch.vtensor<[4096],bf16>, !torch.vtensor<[1,?,4096],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1843, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%1844 = torch.aten.div.Tensor %1843, %97 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1844, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%float-2.400000e02_1599 = torch.constant.float -2.400000e+02
%float2.400000e02_1600 = torch.constant.float 2.400000e+02
%1845 = torch.aten.clamp %1844, %float-2.400000e02_1599, %float2.400000e02_1600 : !torch.vtensor<[1,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1845, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int26_1601 = torch.constant.int 26
%1846 = torch.prims.convert_element_type %1845, %int26_1601 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1846, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
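    // FFN gate projection: matmul with the transposed [14336, 4096] weight %98, dequantize to bf16,
    // and apply SiLU (%1854).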
%int-2_1602 = torch.constant.int -2
%int-1_1603 = torch.constant.int -1
%1847 = torch.aten.transpose.int %98, %int-2_1602, %int-1_1603 : !torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
%int4096_1604 = torch.constant.int 4096
%1848 = torch.prim.ListConstruct %566, %int4096_1604 : (!torch.int, !torch.int) -> !torch.list<int>
%1849 = torch.aten.view %1846, %1848 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1849, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%1850 = torch.aten.mm %1849, %1847 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,14336],f8E4M3FNUZ> -> !torch.vtensor<[?,14336],f8E4M3FNUZ>
torch.bind_symbolic_shape %1850, [%548], affine_map<()[s0] -> (s0 * 32, 14336)> : !torch.vtensor<[?,14336],f8E4M3FNUZ>
%int1_1605 = torch.constant.int 1
%int14336_1606 = torch.constant.int 14336
%1851 = torch.prim.ListConstruct %int1_1605, %566, %int14336_1606 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1852 = torch.aten.view %1850, %1851 : !torch.vtensor<[?,14336],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,14336],f8E4M3FNUZ>
torch.bind_symbolic_shape %1852, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>
%int15_1607 = torch.constant.int 15
%1853 = torch.prims.convert_element_type %1852, %int15_1607 : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,14336],bf16>
torch.bind_symbolic_shape %1853, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16>
%1854 = torch.aten.silu %1853 : !torch.vtensor<[1,?,14336],bf16> -> !torch.vtensor<[1,?,14336],bf16>
torch.bind_symbolic_shape %1854, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16>
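    // FFN up projection with weight %100, dequantized to bf16 and multiplied elementwise with the
    // SiLU-gated branch (SwiGLU, %1865).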
%1855 = torch.aten.div.Tensor %1843, %99 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1855, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%float-2.400000e02_1608 = torch.constant.float -2.400000e+02
%float2.400000e02_1609 = torch.constant.float 2.400000e+02
%1856 = torch.aten.clamp %1855, %float-2.400000e02_1608, %float2.400000e02_1609 : !torch.vtensor<[1,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1856, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int26_1610 = torch.constant.int 26
%1857 = torch.prims.convert_element_type %1856, %int26_1610 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1857, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int-2_1611 = torch.constant.int -2
%int-1_1612 = torch.constant.int -1
%1858 = torch.aten.transpose.int %100, %int-2_1611, %int-1_1612 : !torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
%int4096_1613 = torch.constant.int 4096
%1859 = torch.prim.ListConstruct %566, %int4096_1613 : (!torch.int, !torch.int) -> !torch.list<int>
%1860 = torch.aten.view %1857, %1859 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1860, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%1861 = torch.aten.mm %1860, %1858 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,14336],f8E4M3FNUZ> -> !torch.vtensor<[?,14336],f8E4M3FNUZ>
torch.bind_symbolic_shape %1861, [%548], affine_map<()[s0] -> (s0 * 32, 14336)> : !torch.vtensor<[?,14336],f8E4M3FNUZ>
%int1_1614 = torch.constant.int 1
%int14336_1615 = torch.constant.int 14336
%1862 = torch.prim.ListConstruct %int1_1614, %566, %int14336_1615 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1863 = torch.aten.view %1861, %1862 : !torch.vtensor<[?,14336],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,14336],f8E4M3FNUZ>
torch.bind_symbolic_shape %1863, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>
%int15_1616 = torch.constant.int 15
%1864 = torch.prims.convert_element_type %1863, %int15_1616 : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,14336],bf16>
torch.bind_symbolic_shape %1864, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16>
%1865 = torch.aten.mul.Tensor %1854, %1864 : !torch.vtensor<[1,?,14336],bf16>, !torch.vtensor<[1,?,14336],bf16> -> !torch.vtensor<[1,?,14336],bf16>
torch.bind_symbolic_shape %1865, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16>
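    // FFN down projection: quantize the product (scale %101), matmul with the transposed
    // [4096, 14336] weight %102, dequantize to bf16, and add the residual (%1877).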
%1866 = torch.aten.div.Tensor %1865, %101 : !torch.vtensor<[1,?,14336],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,14336],bf16>
torch.bind_symbolic_shape %1866, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16>
%float-2.400000e02_1617 = torch.constant.float -2.400000e+02
%float2.400000e02_1618 = torch.constant.float 2.400000e+02
%1867 = torch.aten.clamp %1866, %float-2.400000e02_1617, %float2.400000e02_1618 : !torch.vtensor<[1,?,14336],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,14336],bf16>
torch.bind_symbolic_shape %1867, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16>
%int26_1619 = torch.constant.int 26
%1868 = torch.prims.convert_element_type %1867, %int26_1619 : !torch.vtensor<[1,?,14336],bf16>, !torch.int -> !torch.vtensor<[1,?,14336],f8E4M3FNUZ>
torch.bind_symbolic_shape %1868, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>
%int-2_1620 = torch.constant.int -2
%int-1_1621 = torch.constant.int -1
%1869 = torch.aten.transpose.int %102, %int-2_1620, %int-1_1621 : !torch.vtensor<[4096,14336],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%int1_1622 = torch.constant.int 1
%1870 = torch.aten.size.int %1852, %int1_1622 : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>, !torch.int -> !torch.int
%int14336_1623 = torch.constant.int 14336
%1871 = torch.prim.ListConstruct %1870, %int14336_1623 : (!torch.int, !torch.int) -> !torch.list<int>
%1872 = torch.aten.view %1868, %1871 : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,14336],f8E4M3FNUZ>
torch.bind_symbolic_shape %1872, [%548], affine_map<()[s0] -> (s0 * 32, 14336)> : !torch.vtensor<[?,14336],f8E4M3FNUZ>
%1873 = torch.aten.mm %1872, %1869 : !torch.vtensor<[?,14336],f8E4M3FNUZ>, !torch.vtensor<[14336,4096],f8E4M3FNUZ> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1873, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%int1_1624 = torch.constant.int 1
%int4096_1625 = torch.constant.int 4096
%1874 = torch.prim.ListConstruct %int1_1624, %1870, %int4096_1625 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1875 = torch.aten.view %1873, %1874 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1875, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int15_1626 = torch.constant.int 15
%1876 = torch.prims.convert_element_type %1875, %int15_1626 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,4096],bf16>
torch.bind_symbolic_shape %1876, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16>
%int1_1627 = torch.constant.int 1
%1877 = torch.aten.add.Tensor %1836, %1876, %int1_1627 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1877, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
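    // Start of the next decoder block: attention RMSNorm with weight %103, followed by quantization
    // of the normalized input for the Q/K/V projections.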
%int2_1628 = torch.constant.int 2
%1878 = torch.aten.pow.Tensor_Scalar %1877, %int2_1628 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1878, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int-1_1629 = torch.constant.int -1
%1879 = torch.prim.ListConstruct %int-1_1629 : (!torch.int) -> !torch.list<int>
%true_1630 = torch.constant.bool true
%none_1631 = torch.constant.none
%1880 = torch.aten.mean.dim %1878, %1879, %true_1630, %none_1631 : !torch.vtensor<[1,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,?,1],f32>
torch.bind_symbolic_shape %1880, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32>
%float1.000000e-05_1632 = torch.constant.float 1.000000e-05
%int1_1633 = torch.constant.int 1
%1881 = torch.aten.add.Scalar %1880, %float1.000000e-05_1632, %int1_1633 : !torch.vtensor<[1,?,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,?,1],f32>
torch.bind_symbolic_shape %1881, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32>
%1882 = torch.aten.rsqrt %1881 : !torch.vtensor<[1,?,1],f32> -> !torch.vtensor<[1,?,1],f32>
torch.bind_symbolic_shape %1882, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32>
%1883 = torch.aten.mul.Tensor %1877, %1882 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[1,?,1],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1883, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%1884 = torch.aten.mul.Tensor %103, %1883 : !torch.vtensor<[4096],bf16>, !torch.vtensor<[1,?,4096],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1884, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%1885 = torch.aten.div.Tensor %1884, %104 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1885, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%float-2.400000e02_1634 = torch.constant.float -2.400000e+02
%float2.400000e02_1635 = torch.constant.float 2.400000e+02
%1886 = torch.aten.clamp %1885, %float-2.400000e02_1634, %float2.400000e02_1635 : !torch.vtensor<[1,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1886, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int26_1636 = torch.constant.int 26
%1887 = torch.prims.convert_element_type %1886, %int26_1636 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1887, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
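    // Q/K/V projections for this block: Q with the [4096, 4096] weight %105, K with the [1024, 4096]
    // weight %107, V with the [1024, 4096] weight %109, each as an f8 matmul followed by a dequantize to bf16.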
%int-2_1637 = torch.constant.int -2
%int-1_1638 = torch.constant.int -1
%1888 = torch.aten.transpose.int %105, %int-2_1637, %int-1_1638 : !torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%int4096_1639 = torch.constant.int 4096
%1889 = torch.prim.ListConstruct %566, %int4096_1639 : (!torch.int, !torch.int) -> !torch.list<int>
%1890 = torch.aten.view %1887, %1889 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1890, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%1891 = torch.aten.mm %1890, %1888 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,4096],f8E4M3FNUZ> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1891, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%int1_1640 = torch.constant.int 1
%int4096_1641 = torch.constant.int 4096
%1892 = torch.prim.ListConstruct %int1_1640, %566, %int4096_1641 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1893 = torch.aten.view %1891, %1892 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1893, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int15_1642 = torch.constant.int 15
%1894 = torch.prims.convert_element_type %1893, %int15_1642 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,4096],bf16>
torch.bind_symbolic_shape %1894, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16>
%1895 = torch.aten.div.Tensor %1884, %106 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1895, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%float-2.400000e02_1643 = torch.constant.float -2.400000e+02
%float2.400000e02_1644 = torch.constant.float 2.400000e+02
%1896 = torch.aten.clamp %1895, %float-2.400000e02_1643, %float2.400000e02_1644 : !torch.vtensor<[1,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1896, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int26_1645 = torch.constant.int 26
%1897 = torch.prims.convert_element_type %1896, %int26_1645 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1897, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int-2_1646 = torch.constant.int -2
%int-1_1647 = torch.constant.int -1
%1898 = torch.aten.transpose.int %107, %int-2_1646, %int-1_1647 : !torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,1024],f8E4M3FNUZ>
%int4096_1648 = torch.constant.int 4096
%1899 = torch.prim.ListConstruct %566, %int4096_1648 : (!torch.int, !torch.int) -> !torch.list<int>
%1900 = torch.aten.view %1897, %1899 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1900, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%1901 = torch.aten.mm %1900, %1898 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,1024],f8E4M3FNUZ> -> !torch.vtensor<[?,1024],f8E4M3FNUZ>
torch.bind_symbolic_shape %1901, [%548], affine_map<()[s0] -> (s0 * 32, 1024)> : !torch.vtensor<[?,1024],f8E4M3FNUZ>
%int1_1649 = torch.constant.int 1
%int1024_1650 = torch.constant.int 1024
%1902 = torch.prim.ListConstruct %int1_1649, %566, %int1024_1650 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1903 = torch.aten.view %1901, %1902 : !torch.vtensor<[?,1024],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,1024],f8E4M3FNUZ>
torch.bind_symbolic_shape %1903, [%548], affine_map<()[s0] -> (1, s0 * 32, 1024)> : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>
%int15_1651 = torch.constant.int 15
%1904 = torch.prims.convert_element_type %1903, %int15_1651 : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,1024],bf16>
torch.bind_symbolic_shape %1904, [%548], affine_map<()[s0] -> (1, s0 * 32, 1024)> : !torch.vtensor<[1,?,1024],bf16>
%1905 = torch.aten.div.Tensor %1884, %108 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1905, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%float-2.400000e02_1652 = torch.constant.float -2.400000e+02
%float2.400000e02_1653 = torch.constant.float 2.400000e+02
%1906 = torch.aten.clamp %1905, %float-2.400000e02_1652, %float2.400000e02_1653 : !torch.vtensor<[1,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1906, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int26_1654 = torch.constant.int 26
%1907 = torch.prims.convert_element_type %1906, %int26_1654 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1907, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int-2_1655 = torch.constant.int -2
%int-1_1656 = torch.constant.int -1
%1908 = torch.aten.transpose.int %109, %int-2_1655, %int-1_1656 : !torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,1024],f8E4M3FNUZ>
%int4096_1657 = torch.constant.int 4096
%1909 = torch.prim.ListConstruct %566, %int4096_1657 : (!torch.int, !torch.int) -> !torch.list<int>
%1910 = torch.aten.view %1907, %1909 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1910, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%1911 = torch.aten.mm %1910, %1908 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,1024],f8E4M3FNUZ> -> !torch.vtensor<[?,1024],f8E4M3FNUZ>
torch.bind_symbolic_shape %1911, [%548], affine_map<()[s0] -> (s0 * 32, 1024)> : !torch.vtensor<[?,1024],f8E4M3FNUZ>
%int1_1658 = torch.constant.int 1
%int1024_1659 = torch.constant.int 1024
%1912 = torch.prim.ListConstruct %int1_1658, %566, %int1024_1659 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1913 = torch.aten.view %1911, %1912 : !torch.vtensor<[?,1024],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,1024],f8E4M3FNUZ>
torch.bind_symbolic_shape %1913, [%548], affine_map<()[s0] -> (1, s0 * 32, 1024)> : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>
%int15_1660 = torch.constant.int 15
%1914 = torch.prims.convert_element_type %1913, %int15_1660 : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,1024],bf16>
torch.bind_symbolic_shape %1914, [%548], affine_map<()[s0] -> (1, s0 * 32, 1024)> : !torch.vtensor<[1,?,1024],bf16>
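    // Reshape the projections into attention heads: queries to [1, seq, 32, 128],
    // keys and values to [1, seq, 8, 128].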
%int1_1661 = torch.constant.int 1
%int32_1662 = torch.constant.int 32
%int128_1663 = torch.constant.int 128
%1915 = torch.prim.ListConstruct %int1_1661, %566, %int32_1662, %int128_1663 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1916 = torch.aten.view %1894, %1915 : !torch.vtensor<[1,?,4096],bf16>, !torch.list<int> -> !torch.vtensor<[1,?,32,128],bf16>
torch.bind_symbolic_shape %1916, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16>
%int1_1664 = torch.constant.int 1
%int8_1665 = torch.constant.int 8
%int128_1666 = torch.constant.int 128
%1917 = torch.prim.ListConstruct %int1_1664, %566, %int8_1665, %int128_1666 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1918 = torch.aten.view %1904, %1917 : !torch.vtensor<[1,?,1024],bf16>, !torch.list<int> -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %1918, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
%int1_1667 = torch.constant.int 1
%int8_1668 = torch.constant.int 8
%int128_1669 = torch.constant.int 128
%1919 = torch.prim.ListConstruct %int1_1667, %566, %int8_1668, %int128_1669 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1920 = torch.aten.view %1914, %1919 : !torch.vtensor<[1,?,1024],bf16>, !torch.list<int> -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %1920, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
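    // Rebuild the rotary-embedding angle table for this block's queries
    // (same arange / inverse-frequency computation as above).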
%int131072_1670 = torch.constant.int 131072
%none_1671 = torch.constant.none
%none_1672 = torch.constant.none
%cpu_1673 = torch.constant.device "cpu"
%false_1674 = torch.constant.bool false
%1921 = torch.aten.arange %int131072_1670, %none_1671, %none_1672, %cpu_1673, %false_1674 : !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[131072],si64>
%int0_1675 = torch.constant.int 0
%int128_1676 = torch.constant.int 128
%none_1677 = torch.constant.none
%none_1678 = torch.constant.none
%cpu_1679 = torch.constant.device "cpu"
%false_1680 = torch.constant.bool false
%1922 = torch.aten.arange.start %int0_1675, %int128_1676, %none_1677, %none_1678, %cpu_1679, %false_1680 : !torch.int, !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[128],si64>
%int2_1681 = torch.constant.int 2
%1923 = torch.aten.floor_divide.Scalar %1922, %int2_1681 : !torch.vtensor<[128],si64>, !torch.int -> !torch.vtensor<[128],si64>
%int6_1682 = torch.constant.int 6
%1924 = torch.prims.convert_element_type %1923, %int6_1682 : !torch.vtensor<[128],si64>, !torch.int -> !torch.vtensor<[128],f32>
%int128_1683 = torch.constant.int 128
%1925 = torch.aten.div.Scalar %1924, %int128_1683 : !torch.vtensor<[128],f32>, !torch.int -> !torch.vtensor<[128],f32>
%float2.000000e00_1684 = torch.constant.float 2.000000e+00
%1926 = torch.aten.mul.Scalar %1925, %float2.000000e00_1684 : !torch.vtensor<[128],f32>, !torch.float -> !torch.vtensor<[128],f32>
%float5.000000e05_1685 = torch.constant.float 5.000000e+05
%1927 = torch.aten.pow.Scalar %float5.000000e05_1685, %1926 : !torch.float, !torch.vtensor<[128],f32> -> !torch.vtensor<[128],f32>
%1928 = torch.aten.reciprocal %1927 : !torch.vtensor<[128],f32> -> !torch.vtensor<[128],f32>
%float1.000000e00_1686 = torch.constant.float 1.000000e+00
%1929 = torch.aten.mul.Scalar %1928, %float1.000000e00_1686 : !torch.vtensor<[128],f32>, !torch.float -> !torch.vtensor<[128],f32>
%int131072_1687 = torch.constant.int 131072
%int1_1688 = torch.constant.int 1
%1930 = torch.prim.ListConstruct %int131072_1687, %int1_1688 : (!torch.int, !torch.int) -> !torch.list<int>
%1931 = torch.aten.view %1921, %1930 : !torch.vtensor<[131072],si64>, !torch.list<int> -> !torch.vtensor<[131072,1],si64>
%1932 = torch.aten.mul.Tensor %1931, %1929 : !torch.vtensor<[131072,1],si64>, !torch.vtensor<[128],f32> -> !torch.vtensor<[131072,128],f32>
%int1_1689 = torch.constant.int 1
%1933 = torch.aten.size.int %1893, %int1_1689 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.int -> !torch.int
%int0_1690 = torch.constant.int 0
%1934 = torch.aten.add.int %int0_1690, %1933 : !torch.int, !torch.int -> !torch.int
%int0_1691 = torch.constant.int 0
%int0_1692 = torch.constant.int 0
%int1_1693 = torch.constant.int 1
%1935 = torch.aten.slice.Tensor %1932, %int0_1691, %int0_1692, %1934, %int1_1693 : !torch.vtensor<[131072,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],f32>
torch.bind_symbolic_shape %1935, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],f32>
%int1_1694 = torch.constant.int 1
%int0_1695 = torch.constant.int 0
%int9223372036854775807_1696 = torch.constant.int 9223372036854775807
%int1_1697 = torch.constant.int 1
%1936 = torch.aten.slice.Tensor %1935, %int1_1694, %int0_1695, %int9223372036854775807_1696, %int1_1697 : !torch.vtensor<[?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],f32>
torch.bind_symbolic_shape %1936, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],f32>
%int1_1698 = torch.constant.int 1
%int0_1699 = torch.constant.int 0
%int9223372036854775807_1700 = torch.constant.int 9223372036854775807
%int1_1701 = torch.constant.int 1
%1937 = torch.aten.slice.Tensor %1936, %int1_1698, %int0_1699, %int9223372036854775807_1700, %int1_1701 : !torch.vtensor<[?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],f32>
torch.bind_symbolic_shape %1937, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],f32>
%int0_1702 = torch.constant.int 0
%1938 = torch.aten.unsqueeze %1937, %int0_1702 : !torch.vtensor<[?,128],f32>, !torch.int -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %1938, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int1_1703 = torch.constant.int 1
%int0_1704 = torch.constant.int 0
%int9223372036854775807_1705 = torch.constant.int 9223372036854775807
%int1_1706 = torch.constant.int 1
%1939 = torch.aten.slice.Tensor %1938, %int1_1703, %int0_1704, %int9223372036854775807_1705, %int1_1706 : !torch.vtensor<[1,?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %1939, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int2_1707 = torch.constant.int 2
%int0_1708 = torch.constant.int 0
%int9223372036854775807_1709 = torch.constant.int 9223372036854775807
%int1_1710 = torch.constant.int 1
%1940 = torch.aten.slice.Tensor %1939, %int2_1707, %int0_1708, %int9223372036854775807_1709, %int1_1710 : !torch.vtensor<[1,?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %1940, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int1_1711 = torch.constant.int 1
%int1_1712 = torch.constant.int 1
%int1_1713 = torch.constant.int 1
%1941 = torch.prim.ListConstruct %int1_1711, %int1_1712, %int1_1713 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1942 = torch.aten.repeat %1940, %1941 : !torch.vtensor<[1,?,128],f32>, !torch.list<int> -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %1942, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int6_1714 = torch.constant.int 6
%1943 = torch.prims.convert_element_type %1916, %int6_1714 : !torch.vtensor<[1,?,32,128],bf16>, !torch.int -> !torch.vtensor<[1,?,32,128],f32>
torch.bind_symbolic_shape %1943, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32>
%1944 = torch_c.to_builtin_tensor %1943 : !torch.vtensor<[1,?,32,128],f32> -> tensor<1x?x32x128xf32>
%1945 = torch_c.to_builtin_tensor %1942 : !torch.vtensor<[1,?,128],f32> -> tensor<1x?x128xf32>
%1946 = util.call @sharktank_rotary_embedding_1_D_32_128_f32(%1944, %1945) : (tensor<1x?x32x128xf32>, tensor<1x?x128xf32>) -> tensor<1x?x32x128xf32>
%1947 = torch_c.from_builtin_tensor %1946 : tensor<1x?x32x128xf32> -> !torch.vtensor<[1,?,32,128],f32>
torch.bind_symbolic_shape %1947, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32>
%int15_1715 = torch.constant.int 15
%1948 = torch.prims.convert_element_type %1947, %int15_1715 : !torch.vtensor<[1,?,32,128],f32>, !torch.int -> !torch.vtensor<[1,?,32,128],bf16>
torch.bind_symbolic_shape %1948, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16>
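// %1946/%1948: RoPE applied to the 32 query heads via the sharktank rotary-embedding kernel
// (computed in f32, cast back to bf16). The same frequency-table construction is repeated below
// before the 8-head variant of the kernel is called for K.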
%int131072_1716 = torch.constant.int 131072
%none_1717 = torch.constant.none
%none_1718 = torch.constant.none
%cpu_1719 = torch.constant.device "cpu"
%false_1720 = torch.constant.bool false
%1949 = torch.aten.arange %int131072_1716, %none_1717, %none_1718, %cpu_1719, %false_1720 : !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[131072],si64>
%int0_1721 = torch.constant.int 0
%int128_1722 = torch.constant.int 128
%none_1723 = torch.constant.none
%none_1724 = torch.constant.none
%cpu_1725 = torch.constant.device "cpu"
%false_1726 = torch.constant.bool false
%1950 = torch.aten.arange.start %int0_1721, %int128_1722, %none_1723, %none_1724, %cpu_1725, %false_1726 : !torch.int, !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[128],si64>
%int2_1727 = torch.constant.int 2
%1951 = torch.aten.floor_divide.Scalar %1950, %int2_1727 : !torch.vtensor<[128],si64>, !torch.int -> !torch.vtensor<[128],si64>
%int6_1728 = torch.constant.int 6
%1952 = torch.prims.convert_element_type %1951, %int6_1728 : !torch.vtensor<[128],si64>, !torch.int -> !torch.vtensor<[128],f32>
%int128_1729 = torch.constant.int 128
%1953 = torch.aten.div.Scalar %1952, %int128_1729 : !torch.vtensor<[128],f32>, !torch.int -> !torch.vtensor<[128],f32>
%float2.000000e00_1730 = torch.constant.float 2.000000e+00
%1954 = torch.aten.mul.Scalar %1953, %float2.000000e00_1730 : !torch.vtensor<[128],f32>, !torch.float -> !torch.vtensor<[128],f32>
%float5.000000e05_1731 = torch.constant.float 5.000000e+05
%1955 = torch.aten.pow.Scalar %float5.000000e05_1731, %1954 : !torch.float, !torch.vtensor<[128],f32> -> !torch.vtensor<[128],f32>
%1956 = torch.aten.reciprocal %1955 : !torch.vtensor<[128],f32> -> !torch.vtensor<[128],f32>
%float1.000000e00_1732 = torch.constant.float 1.000000e+00
%1957 = torch.aten.mul.Scalar %1956, %float1.000000e00_1732 : !torch.vtensor<[128],f32>, !torch.float -> !torch.vtensor<[128],f32>
%int131072_1733 = torch.constant.int 131072
%int1_1734 = torch.constant.int 1
%1958 = torch.prim.ListConstruct %int131072_1733, %int1_1734 : (!torch.int, !torch.int) -> !torch.list<int>
%1959 = torch.aten.view %1949, %1958 : !torch.vtensor<[131072],si64>, !torch.list<int> -> !torch.vtensor<[131072,1],si64>
%1960 = torch.aten.mul.Tensor %1959, %1957 : !torch.vtensor<[131072,1],si64>, !torch.vtensor<[128],f32> -> !torch.vtensor<[131072,128],f32>
%int1_1735 = torch.constant.int 1
%1961 = torch.aten.size.int %1903, %int1_1735 : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>, !torch.int -> !torch.int
%int0_1736 = torch.constant.int 0
%1962 = torch.aten.add.int %int0_1736, %1961 : !torch.int, !torch.int -> !torch.int
%int0_1737 = torch.constant.int 0
%int0_1738 = torch.constant.int 0
%int1_1739 = torch.constant.int 1
%1963 = torch.aten.slice.Tensor %1960, %int0_1737, %int0_1738, %1962, %int1_1739 : !torch.vtensor<[131072,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],f32>
torch.bind_symbolic_shape %1963, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],f32>
%int1_1740 = torch.constant.int 1
%int0_1741 = torch.constant.int 0
%int9223372036854775807_1742 = torch.constant.int 9223372036854775807
%int1_1743 = torch.constant.int 1
%1964 = torch.aten.slice.Tensor %1963, %int1_1740, %int0_1741, %int9223372036854775807_1742, %int1_1743 : !torch.vtensor<[?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],f32>
torch.bind_symbolic_shape %1964, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],f32>
%int1_1744 = torch.constant.int 1
%int0_1745 = torch.constant.int 0
%int9223372036854775807_1746 = torch.constant.int 9223372036854775807
%int1_1747 = torch.constant.int 1
%1965 = torch.aten.slice.Tensor %1964, %int1_1744, %int0_1745, %int9223372036854775807_1746, %int1_1747 : !torch.vtensor<[?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],f32>
torch.bind_symbolic_shape %1965, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],f32>
%int0_1748 = torch.constant.int 0
%1966 = torch.aten.unsqueeze %1965, %int0_1748 : !torch.vtensor<[?,128],f32>, !torch.int -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %1966, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int1_1749 = torch.constant.int 1
%int0_1750 = torch.constant.int 0
%int9223372036854775807_1751 = torch.constant.int 9223372036854775807
%int1_1752 = torch.constant.int 1
%1967 = torch.aten.slice.Tensor %1966, %int1_1749, %int0_1750, %int9223372036854775807_1751, %int1_1752 : !torch.vtensor<[1,?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %1967, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int2_1753 = torch.constant.int 2
%int0_1754 = torch.constant.int 0
%int9223372036854775807_1755 = torch.constant.int 9223372036854775807
%int1_1756 = torch.constant.int 1
%1968 = torch.aten.slice.Tensor %1967, %int2_1753, %int0_1754, %int9223372036854775807_1755, %int1_1756 : !torch.vtensor<[1,?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %1968, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int1_1757 = torch.constant.int 1
%int1_1758 = torch.constant.int 1
%int1_1759 = torch.constant.int 1
%1969 = torch.prim.ListConstruct %int1_1757, %int1_1758, %int1_1759 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1970 = torch.aten.repeat %1968, %1969 : !torch.vtensor<[1,?,128],f32>, !torch.list<int> -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %1970, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int6_1760 = torch.constant.int 6
%1971 = torch.prims.convert_element_type %1918, %int6_1760 : !torch.vtensor<[1,?,8,128],bf16>, !torch.int -> !torch.vtensor<[1,?,8,128],f32>
torch.bind_symbolic_shape %1971, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],f32>
%1972 = torch_c.to_builtin_tensor %1971 : !torch.vtensor<[1,?,8,128],f32> -> tensor<1x?x8x128xf32>
%1973 = torch_c.to_builtin_tensor %1970 : !torch.vtensor<[1,?,128],f32> -> tensor<1x?x128xf32>
%1974 = util.call @sharktank_rotary_embedding_1_D_8_128_f32(%1972, %1973) : (tensor<1x?x8x128xf32>, tensor<1x?x128xf32>) -> tensor<1x?x8x128xf32>
%1975 = torch_c.from_builtin_tensor %1974 : tensor<1x?x8x128xf32> -> !torch.vtensor<[1,?,8,128],f32>
torch.bind_symbolic_shape %1975, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],f32>
%int15_1761 = torch.constant.int 15
%1976 = torch.prims.convert_element_type %1975, %int15_1761 : !torch.vtensor<[1,?,8,128],f32>, !torch.int -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %1976, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
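// K (with RoPE applied, %1976) and V (%1920) are quantized for the KV cache: divide by what is
// presumably the kv_cache quantizer scale (%110), clamp to [-240, 240] (the f8E4M3FNUZ finite
// range), and convert to f8E4M3FNUZ.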
%1977 = torch.aten.div.Tensor %1976, %110 : !torch.vtensor<[1,?,8,128],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %1977, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
%float-2.400000e02_1762 = torch.constant.float -2.400000e+02
%float2.400000e02_1763 = torch.constant.float 2.400000e+02
%1978 = torch.aten.clamp %1977, %float-2.400000e02_1762, %float2.400000e02_1763 : !torch.vtensor<[1,?,8,128],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %1978, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
%int26_1764 = torch.constant.int 26
%1979 = torch.prims.convert_element_type %1978, %int26_1764 : !torch.vtensor<[1,?,8,128],bf16>, !torch.int -> !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %1979, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>
%1980 = torch.aten.div.Tensor %1920, %110 : !torch.vtensor<[1,?,8,128],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %1980, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
%float-2.400000e02_1765 = torch.constant.float -2.400000e+02
%float2.400000e02_1766 = torch.constant.float 2.400000e+02
%1981 = torch.aten.clamp %1980, %float-2.400000e02_1765, %float2.400000e02_1766 : !torch.vtensor<[1,?,8,128],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %1981, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
%int26_1767 = torch.constant.int 26
%1982 = torch.prims.convert_element_type %1981, %int26_1767 : !torch.vtensor<[1,?,8,128],bf16>, !torch.int -> !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %1982, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>
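// Paged KV-cache write for K: slot indices are %arg2 * 64 + 12 (the +12 offset presumably selects
// this transformer block's K slot in the page layout), the f16 page buffer (%1799) is viewed as
// [pages, 32, 2, 32, 8, 128], flattened, and the quantized K is scattered in with index_put.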
%int64_1768 = torch.constant.int 64
%1983 = torch.aten.mul.Scalar %arg2, %int64_1768 : !torch.vtensor<[1,?],si64>, !torch.int -> !torch.vtensor<[1,?],si64>
torch.bind_symbolic_shape %1983, [%548], affine_map<()[s0] -> (1, s0)> : !torch.vtensor<[1,?],si64>
%int12 = torch.constant.int 12
%int1_1769 = torch.constant.int 1
%1984 = torch.aten.add.Scalar %1983, %int12, %int1_1769 : !torch.vtensor<[1,?],si64>, !torch.int, !torch.int -> !torch.vtensor<[1,?],si64>
torch.bind_symbolic_shape %1984, [%548], affine_map<()[s0] -> (1, s0)> : !torch.vtensor<[1,?],si64>
%int1_1770 = torch.constant.int 1
%int32_1771 = torch.constant.int 32
%int8_1772 = torch.constant.int 8
%int128_1773 = torch.constant.int 128
%1985 = torch.prim.ListConstruct %int1_1770, %670, %int32_1771, %int8_1772, %int128_1773 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1986 = torch.aten.view %1979, %1985 : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %1986, [%548], affine_map<()[s0] -> (1, s0, 32, 8, 128)> : !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>
%int32_1774 = torch.constant.int 32
%int8_1775 = torch.constant.int 8
%int128_1776 = torch.constant.int 128
%1987 = torch.prim.ListConstruct %670, %int32_1774, %int8_1775, %int128_1776 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1988 = torch.aten.view %1986, %1987 : !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %1988, [%548], affine_map<()[s0] -> (s0, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
%1989 = torch.prim.ListConstruct %670 : (!torch.int) -> !torch.list<int>
%1990 = torch.aten.view %1984, %1989 : !torch.vtensor<[1,?],si64>, !torch.list<int> -> !torch.vtensor<[?],si64>
torch.bind_symbolic_shape %1990, [%548], affine_map<()[s0] -> (s0)> : !torch.vtensor<[?],si64>
%int32_1777 = torch.constant.int 32
%int2_1778 = torch.constant.int 2
%int32_1779 = torch.constant.int 32
%int8_1780 = torch.constant.int 8
%int128_1781 = torch.constant.int 128
%1991 = torch.prim.ListConstruct %661, %int32_1777, %int2_1778, %int32_1779, %int8_1780, %int128_1781 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1992 = torch.aten.view %1799, %1991 : !torch.vtensor<[?,2097152],f16>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f16>
torch.bind_symbolic_shape %1992, [%549], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f16>
%int32_1782 = torch.constant.int 32
%1993 = torch.aten.mul.int %661, %int32_1782 : !torch.int, !torch.int -> !torch.int
%int2_1783 = torch.constant.int 2
%1994 = torch.aten.mul.int %1993, %int2_1783 : !torch.int, !torch.int -> !torch.int
%int32_1784 = torch.constant.int 32
%int8_1785 = torch.constant.int 8
%int128_1786 = torch.constant.int 128
%1995 = torch.prim.ListConstruct %1994, %int32_1784, %int8_1785, %int128_1786 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1996 = torch.aten.view %1992, %1995 : !torch.vtensor<[?,32,2,32,8,128],f16>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f16>
torch.bind_symbolic_shape %1996, [%549], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f16>
%1997 = torch.prim.ListConstruct %1990 : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>>
%false_1787 = torch.constant.bool false
%1998 = torch.aten.index_put %1996, %1997, %1988, %false_1787 : !torch.vtensor<[?,32,8,128],f16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.bool -> !torch.vtensor<[?,32,8,128],f16>
torch.bind_symbolic_shape %1998, [%549], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f16>
%int32_1788 = torch.constant.int 32
%int2_1789 = torch.constant.int 2
%int32_1790 = torch.constant.int 32
%int8_1791 = torch.constant.int 8
%int128_1792 = torch.constant.int 128
%1999 = torch.prim.ListConstruct %661, %int32_1788, %int2_1789, %int32_1790, %int8_1791, %int128_1792 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2000 = torch.aten.view %1998, %1999 : !torch.vtensor<[?,32,8,128],f16>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f16>
torch.bind_symbolic_shape %2000, [%549], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f16>
%int2097152_1793 = torch.constant.int 2097152
%2001 = torch.prim.ListConstruct %661, %int2097152_1793 : (!torch.int, !torch.int) -> !torch.list<int>
%2002 = torch.aten.view %2000, %2001 : !torch.vtensor<[?,32,2,32,8,128],f16>, !torch.list<int> -> !torch.vtensor<[?,2097152],f16>
torch.bind_symbolic_shape %2002, [%549], affine_map<()[s0] -> (s0, 2097152)> : !torch.vtensor<[?,2097152],f16>
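// The same scatter follows for V, using the K indices + 1; the cache is again reshaped back to
// its flat [?, 2097152] form (%2019) after the write.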
%int32_1794 = torch.constant.int 32
%int2_1795 = torch.constant.int 2
%int32_1796 = torch.constant.int 32
%int8_1797 = torch.constant.int 8
%int128_1798 = torch.constant.int 128
%2003 = torch.prim.ListConstruct %661, %int32_1794, %int2_1795, %int32_1796, %int8_1797, %int128_1798 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2004 = torch.aten.view %2002, %2003 : !torch.vtensor<[?,2097152],f16>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f16>
torch.bind_symbolic_shape %2004, [%549], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f16>
%int32_1799 = torch.constant.int 32
%int8_1800 = torch.constant.int 8
%int128_1801 = torch.constant.int 128
%2005 = torch.prim.ListConstruct %1994, %int32_1799, %int8_1800, %int128_1801 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2006 = torch.aten.view %2004, %2005 : !torch.vtensor<[?,32,2,32,8,128],f16>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f16>
torch.bind_symbolic_shape %2006, [%549], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f16>
%int1_1802 = torch.constant.int 1
%int32_1803 = torch.constant.int 32
%int8_1804 = torch.constant.int 8
%int128_1805 = torch.constant.int 128
%2007 = torch.prim.ListConstruct %int1_1802, %670, %int32_1803, %int8_1804, %int128_1805 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2008 = torch.aten.view %1982, %2007 : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %2008, [%548], affine_map<()[s0] -> (1, s0, 32, 8, 128)> : !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>
%int32_1806 = torch.constant.int 32
%int8_1807 = torch.constant.int 8
%int128_1808 = torch.constant.int 128
%2009 = torch.prim.ListConstruct %670, %int32_1806, %int8_1807, %int128_1808 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2010 = torch.aten.view %2008, %2009 : !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %2010, [%548], affine_map<()[s0] -> (s0, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
%int1_1809 = torch.constant.int 1
%int1_1810 = torch.constant.int 1
%2011 = torch.aten.add.Scalar %1984, %int1_1809, %int1_1810 : !torch.vtensor<[1,?],si64>, !torch.int, !torch.int -> !torch.vtensor<[1,?],si64>
torch.bind_symbolic_shape %2011, [%548], affine_map<()[s0] -> (1, s0)> : !torch.vtensor<[1,?],si64>
%2012 = torch.prim.ListConstruct %670 : (!torch.int) -> !torch.list<int>
%2013 = torch.aten.view %2011, %2012 : !torch.vtensor<[1,?],si64>, !torch.list<int> -> !torch.vtensor<[?],si64>
torch.bind_symbolic_shape %2013, [%548], affine_map<()[s0] -> (s0)> : !torch.vtensor<[?],si64>
%2014 = torch.prim.ListConstruct %2013 : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>>
%false_1811 = torch.constant.bool false
%2015 = torch.aten.index_put %2006, %2014, %2010, %false_1811 : !torch.vtensor<[?,32,8,128],f16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.bool -> !torch.vtensor<[?,32,8,128],f16>
torch.bind_symbolic_shape %2015, [%549], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f16>
%int32_1812 = torch.constant.int 32
%int2_1813 = torch.constant.int 2
%int32_1814 = torch.constant.int 32
%int8_1815 = torch.constant.int 8
%int128_1816 = torch.constant.int 128
%2016 = torch.prim.ListConstruct %661, %int32_1812, %int2_1813, %int32_1814, %int8_1815, %int128_1816 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2017 = torch.aten.view %2015, %2016 : !torch.vtensor<[?,32,8,128],f16>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f16>
torch.bind_symbolic_shape %2017, [%549], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f16>
%int2097152_1817 = torch.constant.int 2097152
%2018 = torch.prim.ListConstruct %661, %int2097152_1817 : (!torch.int, !torch.int) -> !torch.list<int>
%2019 = torch.aten.view %2017, %2018 : !torch.vtensor<[?,32,2,32,8,128],f16>, !torch.list<int> -> !torch.vtensor<[?,2097152],f16>
torch.bind_symbolic_shape %2019, [%549], affine_map<()[s0] -> (s0, 2097152)> : !torch.vtensor<[?,2097152],f16>
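// GQA head expansion: the 8 KV heads of K and V are unsqueezed, expanded to 8 x 4, and reshaped
// to 32 heads to match the 32 query heads; the f8 values are then dequantized (multiply by %110),
// and cast to bf16 for attention.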
%int-2_1818 = torch.constant.int -2
%2020 = torch.aten.unsqueeze %1979, %int-2_1818 : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %2020, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 1, 128)> : !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>
%int1_1819 = torch.constant.int 1
%int8_1820 = torch.constant.int 8
%int4_1821 = torch.constant.int 4
%int128_1822 = torch.constant.int 128
%2021 = torch.prim.ListConstruct %int1_1819, %1961, %int8_1820, %int4_1821, %int128_1822 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%false_1823 = torch.constant.bool false
%2022 = torch.aten.expand %2020, %2021, %false_1823 : !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %2022, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 4, 128)> : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
%int0_1824 = torch.constant.int 0
%2023 = torch.aten.clone %2022, %int0_1824 : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %2023, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 4, 128)> : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
%int1_1825 = torch.constant.int 1
%int32_1826 = torch.constant.int 32
%int128_1827 = torch.constant.int 128
%2024 = torch.prim.ListConstruct %int1_1825, %1961, %int32_1826, %int128_1827 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2025 = torch.aten._unsafe_view %2023, %2024 : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %2025, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>
%int-2_1828 = torch.constant.int -2
%2026 = torch.aten.unsqueeze %1982, %int-2_1828 : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %2026, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 1, 128)> : !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>
%int1_1829 = torch.constant.int 1
%2027 = torch.aten.size.int %1913, %int1_1829 : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>, !torch.int -> !torch.int
%int1_1830 = torch.constant.int 1
%int8_1831 = torch.constant.int 8
%int4_1832 = torch.constant.int 4
%int128_1833 = torch.constant.int 128
%2028 = torch.prim.ListConstruct %int1_1830, %2027, %int8_1831, %int4_1832, %int128_1833 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%false_1834 = torch.constant.bool false
%2029 = torch.aten.expand %2026, %2028, %false_1834 : !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %2029, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 4, 128)> : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
%int0_1835 = torch.constant.int 0
%2030 = torch.aten.clone %2029, %int0_1835 : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %2030, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 4, 128)> : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
%int1_1836 = torch.constant.int 1
%int32_1837 = torch.constant.int 32
%int128_1838 = torch.constant.int 128
%2031 = torch.prim.ListConstruct %int1_1836, %2027, %int32_1837, %int128_1838 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2032 = torch.aten._unsafe_view %2030, %2031 : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %2032, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>
%int6_1839 = torch.constant.int 6
%2033 = torch.prims.convert_element_type %2025, %int6_1839 : !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,32,128],f32>
torch.bind_symbolic_shape %2033, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32>
%2034 = torch.aten.mul.Tensor %2033, %110 : !torch.vtensor<[1,?,32,128],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,32,128],f32>
torch.bind_symbolic_shape %2034, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32>
%int15_1840 = torch.constant.int 15
%2035 = torch.prims.convert_element_type %2034, %int15_1840 : !torch.vtensor<[1,?,32,128],f32>, !torch.int -> !torch.vtensor<[1,?,32,128],bf16>
torch.bind_symbolic_shape %2035, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16>
%int6_1841 = torch.constant.int 6
%2036 = torch.prims.convert_element_type %2032, %int6_1841 : !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,32,128],f32>
torch.bind_symbolic_shape %2036, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32>
%2037 = torch.aten.mul.Tensor %2036, %110 : !torch.vtensor<[1,?,32,128],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,32,128],f32>
torch.bind_symbolic_shape %2037, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32>
%int15_1842 = torch.constant.int 15
%2038 = torch.prims.convert_element_type %2037, %int15_1842 : !torch.vtensor<[1,?,32,128],f32>, !torch.int -> !torch.vtensor<[1,?,32,128],bf16>
torch.bind_symbolic_shape %2038, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16>
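// Q, K and V are transposed to [1, 32, seq, 128] and fed to the CPU flash-attention op with
// is_causal = true and zero dropout.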
%int1_1843 = torch.constant.int 1
%int2_1844 = torch.constant.int 2
%2039 = torch.aten.transpose.int %1948, %int1_1843, %int2_1844 : !torch.vtensor<[1,?,32,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,32,?,128],bf16>
torch.bind_symbolic_shape %2039, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16>
%int1_1845 = torch.constant.int 1
%int2_1846 = torch.constant.int 2
%2040 = torch.aten.transpose.int %2035, %int1_1845, %int2_1846 : !torch.vtensor<[1,?,32,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,32,?,128],bf16>
torch.bind_symbolic_shape %2040, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16>
%int1_1847 = torch.constant.int 1
%int2_1848 = torch.constant.int 2
%2041 = torch.aten.transpose.int %2038, %int1_1847, %int2_1848 : !torch.vtensor<[1,?,32,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,32,?,128],bf16>
torch.bind_symbolic_shape %2041, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16>
%float0.000000e00_1849 = torch.constant.float 0.000000e+00
%true_1850 = torch.constant.bool true
%none_1851 = torch.constant.none
%none_1852 = torch.constant.none
%2042:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%2039, %2040, %2041, %float0.000000e00_1849, %true_1850, %none_1851, %none_1852) : (!torch.vtensor<[1,32,?,128],bf16>, !torch.vtensor<[1,32,?,128],bf16>, !torch.vtensor<[1,32,?,128],bf16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[1,32,?,128],bf16>, !torch.vtensor<[1,32,?],f32>)
torch.bind_symbolic_shape %2042#0, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16>
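// Attention output path: transpose back to [1, seq, 32, 128], flatten to [1, seq, 4096],
// re-quantize to f8E4M3FNUZ (divide by %111, clamp to +-240), matmul with the transposed weight
// %112 (presumably attn_output), dequantize to bf16, and add the residual (%1877) to get %2056.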
%int1_1853 = torch.constant.int 1
%int2_1854 = torch.constant.int 2
%2043 = torch.aten.transpose.int %2042#0, %int1_1853, %int2_1854 : !torch.vtensor<[1,32,?,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,?,32,128],bf16>
torch.bind_symbolic_shape %2043, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16>
%int1_1855 = torch.constant.int 1
%int4096_1856 = torch.constant.int 4096
%2044 = torch.prim.ListConstruct %int1_1855, %1933, %int4096_1856 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2045 = torch.aten.view %2043, %2044 : !torch.vtensor<[1,?,32,128],bf16>, !torch.list<int> -> !torch.vtensor<[1,?,4096],bf16>
torch.bind_symbolic_shape %2045, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16>
%2046 = torch.aten.div.Tensor %2045, %111 : !torch.vtensor<[1,?,4096],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],bf16>
torch.bind_symbolic_shape %2046, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16>
%float-2.400000e02_1857 = torch.constant.float -2.400000e+02
%float2.400000e02_1858 = torch.constant.float 2.400000e+02
%2047 = torch.aten.clamp %2046, %float-2.400000e02_1857, %float2.400000e02_1858 : !torch.vtensor<[1,?,4096],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],bf16>
torch.bind_symbolic_shape %2047, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16>
%int26_1859 = torch.constant.int 26
%2048 = torch.prims.convert_element_type %2047, %int26_1859 : !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2048, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int-2_1860 = torch.constant.int -2
%int-1_1861 = torch.constant.int -1
%2049 = torch.aten.transpose.int %112, %int-2_1860, %int-1_1861 : !torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%int4096_1862 = torch.constant.int 4096
%2050 = torch.prim.ListConstruct %1933, %int4096_1862 : (!torch.int, !torch.int) -> !torch.list<int>
%2051 = torch.aten.view %2048, %2050 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2051, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%2052 = torch.aten.mm %2051, %2049 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,4096],f8E4M3FNUZ> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2052, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%int1_1863 = torch.constant.int 1
%int4096_1864 = torch.constant.int 4096
%2053 = torch.prim.ListConstruct %int1_1863, %1933, %int4096_1864 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2054 = torch.aten.view %2052, %2053 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2054, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int15_1865 = torch.constant.int 15
%2055 = torch.prims.convert_element_type %2054, %int15_1865 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,4096],bf16>
torch.bind_symbolic_shape %2055, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16>
%int1_1866 = torch.constant.int 1
%2056 = torch.aten.add.Tensor %1877, %2055, %int1_1866 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2056, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
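// FFN sub-layer: RMSNorm of the residual %2056 (eps 1e-05, weight %113, presumably ffn_norm),
// quantization to f8, then a SwiGLU MLP: gate projection (4096 -> 14336) with silu, up projection,
// elementwise product, and down projection (14336 -> 4096) whose bf16 result is added back to the
// residual at %2097.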
%int2_1867 = torch.constant.int 2
%2057 = torch.aten.pow.Tensor_Scalar %2056, %int2_1867 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2057, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int-1_1868 = torch.constant.int -1
%2058 = torch.prim.ListConstruct %int-1_1868 : (!torch.int) -> !torch.list<int>
%true_1869 = torch.constant.bool true
%none_1870 = torch.constant.none
%2059 = torch.aten.mean.dim %2057, %2058, %true_1869, %none_1870 : !torch.vtensor<[1,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,?,1],f32>
torch.bind_symbolic_shape %2059, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32>
%float1.000000e-05_1871 = torch.constant.float 1.000000e-05
%int1_1872 = torch.constant.int 1
%2060 = torch.aten.add.Scalar %2059, %float1.000000e-05_1871, %int1_1872 : !torch.vtensor<[1,?,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,?,1],f32>
torch.bind_symbolic_shape %2060, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32>
%2061 = torch.aten.rsqrt %2060 : !torch.vtensor<[1,?,1],f32> -> !torch.vtensor<[1,?,1],f32>
torch.bind_symbolic_shape %2061, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32>
%2062 = torch.aten.mul.Tensor %2056, %2061 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[1,?,1],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2062, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%2063 = torch.aten.mul.Tensor %113, %2062 : !torch.vtensor<[4096],bf16>, !torch.vtensor<[1,?,4096],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2063, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%2064 = torch.aten.div.Tensor %2063, %114 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2064, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%float-2.400000e02_1873 = torch.constant.float -2.400000e+02
%float2.400000e02_1874 = torch.constant.float 2.400000e+02
%2065 = torch.aten.clamp %2064, %float-2.400000e02_1873, %float2.400000e02_1874 : !torch.vtensor<[1,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2065, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int26_1875 = torch.constant.int 26
%2066 = torch.prims.convert_element_type %2065, %int26_1875 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2066, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int-2_1876 = torch.constant.int -2
%int-1_1877 = torch.constant.int -1
%2067 = torch.aten.transpose.int %115, %int-2_1876, %int-1_1877 : !torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
%int4096_1878 = torch.constant.int 4096
%2068 = torch.prim.ListConstruct %566, %int4096_1878 : (!torch.int, !torch.int) -> !torch.list<int>
%2069 = torch.aten.view %2066, %2068 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2069, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%2070 = torch.aten.mm %2069, %2067 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,14336],f8E4M3FNUZ> -> !torch.vtensor<[?,14336],f8E4M3FNUZ>
torch.bind_symbolic_shape %2070, [%548], affine_map<()[s0] -> (s0 * 32, 14336)> : !torch.vtensor<[?,14336],f8E4M3FNUZ>
%int1_1879 = torch.constant.int 1
%int14336_1880 = torch.constant.int 14336
%2071 = torch.prim.ListConstruct %int1_1879, %566, %int14336_1880 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2072 = torch.aten.view %2070, %2071 : !torch.vtensor<[?,14336],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,14336],f8E4M3FNUZ>
torch.bind_symbolic_shape %2072, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>
%int15_1881 = torch.constant.int 15
%2073 = torch.prims.convert_element_type %2072, %int15_1881 : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,14336],bf16>
torch.bind_symbolic_shape %2073, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16>
%2074 = torch.aten.silu %2073 : !torch.vtensor<[1,?,14336],bf16> -> !torch.vtensor<[1,?,14336],bf16>
torch.bind_symbolic_shape %2074, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16>
%2075 = torch.aten.div.Tensor %2063, %116 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2075, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%float-2.400000e02_1882 = torch.constant.float -2.400000e+02
%float2.400000e02_1883 = torch.constant.float 2.400000e+02
%2076 = torch.aten.clamp %2075, %float-2.400000e02_1882, %float2.400000e02_1883 : !torch.vtensor<[1,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2076, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int26_1884 = torch.constant.int 26
%2077 = torch.prims.convert_element_type %2076, %int26_1884 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2077, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int-2_1885 = torch.constant.int -2
%int-1_1886 = torch.constant.int -1
%2078 = torch.aten.transpose.int %117, %int-2_1885, %int-1_1886 : !torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
%int4096_1887 = torch.constant.int 4096
%2079 = torch.prim.ListConstruct %566, %int4096_1887 : (!torch.int, !torch.int) -> !torch.list<int>
%2080 = torch.aten.view %2077, %2079 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2080, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%2081 = torch.aten.mm %2080, %2078 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,14336],f8E4M3FNUZ> -> !torch.vtensor<[?,14336],f8E4M3FNUZ>
torch.bind_symbolic_shape %2081, [%548], affine_map<()[s0] -> (s0 * 32, 14336)> : !torch.vtensor<[?,14336],f8E4M3FNUZ>
%int1_1888 = torch.constant.int 1
%int14336_1889 = torch.constant.int 14336
%2082 = torch.prim.ListConstruct %int1_1888, %566, %int14336_1889 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2083 = torch.aten.view %2081, %2082 : !torch.vtensor<[?,14336],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,14336],f8E4M3FNUZ>
torch.bind_symbolic_shape %2083, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>
%int15_1890 = torch.constant.int 15
%2084 = torch.prims.convert_element_type %2083, %int15_1890 : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,14336],bf16>
torch.bind_symbolic_shape %2084, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16>
%2085 = torch.aten.mul.Tensor %2074, %2084 : !torch.vtensor<[1,?,14336],bf16>, !torch.vtensor<[1,?,14336],bf16> -> !torch.vtensor<[1,?,14336],bf16>
torch.bind_symbolic_shape %2085, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16>
%2086 = torch.aten.div.Tensor %2085, %118 : !torch.vtensor<[1,?,14336],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,14336],bf16>
torch.bind_symbolic_shape %2086, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16>
%float-2.400000e02_1891 = torch.constant.float -2.400000e+02
%float2.400000e02_1892 = torch.constant.float 2.400000e+02
%2087 = torch.aten.clamp %2086, %float-2.400000e02_1891, %float2.400000e02_1892 : !torch.vtensor<[1,?,14336],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,14336],bf16>
torch.bind_symbolic_shape %2087, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16>
%int26_1893 = torch.constant.int 26
%2088 = torch.prims.convert_element_type %2087, %int26_1893 : !torch.vtensor<[1,?,14336],bf16>, !torch.int -> !torch.vtensor<[1,?,14336],f8E4M3FNUZ>
torch.bind_symbolic_shape %2088, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>
%int-2_1894 = torch.constant.int -2
%int-1_1895 = torch.constant.int -1
%2089 = torch.aten.transpose.int %119, %int-2_1894, %int-1_1895 : !torch.vtensor<[4096,14336],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%int1_1896 = torch.constant.int 1
%2090 = torch.aten.size.int %2072, %int1_1896 : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>, !torch.int -> !torch.int
%int14336_1897 = torch.constant.int 14336
%2091 = torch.prim.ListConstruct %2090, %int14336_1897 : (!torch.int, !torch.int) -> !torch.list<int>
%2092 = torch.aten.view %2088, %2091 : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,14336],f8E4M3FNUZ>
torch.bind_symbolic_shape %2092, [%548], affine_map<()[s0] -> (s0 * 32, 14336)> : !torch.vtensor<[?,14336],f8E4M3FNUZ>
%2093 = torch.aten.mm %2092, %2089 : !torch.vtensor<[?,14336],f8E4M3FNUZ>, !torch.vtensor<[14336,4096],f8E4M3FNUZ> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2093, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%int1_1898 = torch.constant.int 1
%int4096_1899 = torch.constant.int 4096
%2094 = torch.prim.ListConstruct %int1_1898, %2090, %int4096_1899 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2095 = torch.aten.view %2093, %2094 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2095, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int15_1900 = torch.constant.int 15
%2096 = torch.prims.convert_element_type %2095, %int15_1900 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,4096],bf16>
torch.bind_symbolic_shape %2096, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16>
%int1_1901 = torch.constant.int 1
%2097 = torch.aten.add.Tensor %2056, %2096, %int1_1901 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2097, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
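// What follows appears to be the start of the next transformer block: RMSNorm of %2097 with that
// block's attn_norm weight (%120), quantization, and the Q (4096x4096), K and V (1024x4096)
// projections, each as an f8 matmul followed by a cast back to bf16.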
%int2_1902 = torch.constant.int 2
%2098 = torch.aten.pow.Tensor_Scalar %2097, %int2_1902 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2098, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int-1_1903 = torch.constant.int -1
%2099 = torch.prim.ListConstruct %int-1_1903 : (!torch.int) -> !torch.list<int>
%true_1904 = torch.constant.bool true
%none_1905 = torch.constant.none
%2100 = torch.aten.mean.dim %2098, %2099, %true_1904, %none_1905 : !torch.vtensor<[1,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,?,1],f32>
torch.bind_symbolic_shape %2100, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32>
%float1.000000e-05_1906 = torch.constant.float 1.000000e-05
%int1_1907 = torch.constant.int 1
%2101 = torch.aten.add.Scalar %2100, %float1.000000e-05_1906, %int1_1907 : !torch.vtensor<[1,?,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,?,1],f32>
torch.bind_symbolic_shape %2101, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32>
%2102 = torch.aten.rsqrt %2101 : !torch.vtensor<[1,?,1],f32> -> !torch.vtensor<[1,?,1],f32>
torch.bind_symbolic_shape %2102, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32>
%2103 = torch.aten.mul.Tensor %2097, %2102 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[1,?,1],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2103, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%2104 = torch.aten.mul.Tensor %120, %2103 : !torch.vtensor<[4096],bf16>, !torch.vtensor<[1,?,4096],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2104, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%2105 = torch.aten.div.Tensor %2104, %121 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2105, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%float-2.400000e02_1908 = torch.constant.float -2.400000e+02
%float2.400000e02_1909 = torch.constant.float 2.400000e+02
%2106 = torch.aten.clamp %2105, %float-2.400000e02_1908, %float2.400000e02_1909 : !torch.vtensor<[1,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2106, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int26_1910 = torch.constant.int 26
%2107 = torch.prims.convert_element_type %2106, %int26_1910 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2107, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int-2_1911 = torch.constant.int -2
%int-1_1912 = torch.constant.int -1
%2108 = torch.aten.transpose.int %122, %int-2_1911, %int-1_1912 : !torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%int4096_1913 = torch.constant.int 4096
%2109 = torch.prim.ListConstruct %566, %int4096_1913 : (!torch.int, !torch.int) -> !torch.list<int>
%2110 = torch.aten.view %2107, %2109 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2110, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%2111 = torch.aten.mm %2110, %2108 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,4096],f8E4M3FNUZ> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2111, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%int1_1914 = torch.constant.int 1
%int4096_1915 = torch.constant.int 4096
%2112 = torch.prim.ListConstruct %int1_1914, %566, %int4096_1915 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2113 = torch.aten.view %2111, %2112 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2113, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int15_1916 = torch.constant.int 15
%2114 = torch.prims.convert_element_type %2113, %int15_1916 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,4096],bf16>
torch.bind_symbolic_shape %2114, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16>
%2115 = torch.aten.div.Tensor %2104, %123 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2115, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%float-2.400000e02_1917 = torch.constant.float -2.400000e+02
%float2.400000e02_1918 = torch.constant.float 2.400000e+02
%2116 = torch.aten.clamp %2115, %float-2.400000e02_1917, %float2.400000e02_1918 : !torch.vtensor<[1,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2116, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int26_1919 = torch.constant.int 26
%2117 = torch.prims.convert_element_type %2116, %int26_1919 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2117, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int-2_1920 = torch.constant.int -2
%int-1_1921 = torch.constant.int -1
%2118 = torch.aten.transpose.int %124, %int-2_1920, %int-1_1921 : !torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,1024],f8E4M3FNUZ>
%int4096_1922 = torch.constant.int 4096
%2119 = torch.prim.ListConstruct %566, %int4096_1922 : (!torch.int, !torch.int) -> !torch.list<int>
%2120 = torch.aten.view %2117, %2119 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2120, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%2121 = torch.aten.mm %2120, %2118 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,1024],f8E4M3FNUZ> -> !torch.vtensor<[?,1024],f8E4M3FNUZ>
torch.bind_symbolic_shape %2121, [%548], affine_map<()[s0] -> (s0 * 32, 1024)> : !torch.vtensor<[?,1024],f8E4M3FNUZ>
%int1_1923 = torch.constant.int 1
%int1024_1924 = torch.constant.int 1024
%2122 = torch.prim.ListConstruct %int1_1923, %566, %int1024_1924 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2123 = torch.aten.view %2121, %2122 : !torch.vtensor<[?,1024],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,1024],f8E4M3FNUZ>
torch.bind_symbolic_shape %2123, [%548], affine_map<()[s0] -> (1, s0 * 32, 1024)> : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>
%int15_1925 = torch.constant.int 15
%2124 = torch.prims.convert_element_type %2123, %int15_1925 : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,1024],bf16>
torch.bind_symbolic_shape %2124, [%548], affine_map<()[s0] -> (1, s0 * 32, 1024)> : !torch.vtensor<[1,?,1024],bf16>
%2125 = torch.aten.div.Tensor %2104, %125 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2125, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%float-2.400000e02_1926 = torch.constant.float -2.400000e+02
%float2.400000e02_1927 = torch.constant.float 2.400000e+02
%2126 = torch.aten.clamp %2125, %float-2.400000e02_1926, %float2.400000e02_1927 : !torch.vtensor<[1,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2126, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int26_1928 = torch.constant.int 26
%2127 = torch.prims.convert_element_type %2126, %int26_1928 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2127, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int-2_1929 = torch.constant.int -2
%int-1_1930 = torch.constant.int -1
%2128 = torch.aten.transpose.int %126, %int-2_1929, %int-1_1930 : !torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,1024],f8E4M3FNUZ>
%int4096_1931 = torch.constant.int 4096
%2129 = torch.prim.ListConstruct %566, %int4096_1931 : (!torch.int, !torch.int) -> !torch.list<int>
%2130 = torch.aten.view %2127, %2129 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2130, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%2131 = torch.aten.mm %2130, %2128 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,1024],f8E4M3FNUZ> -> !torch.vtensor<[?,1024],f8E4M3FNUZ>
torch.bind_symbolic_shape %2131, [%548], affine_map<()[s0] -> (s0 * 32, 1024)> : !torch.vtensor<[?,1024],f8E4M3FNUZ>
%int1_1932 = torch.constant.int 1
%int1024_1933 = torch.constant.int 1024
%2132 = torch.prim.ListConstruct %int1_1932, %566, %int1024_1933 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2133 = torch.aten.view %2131, %2132 : !torch.vtensor<[?,1024],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,1024],f8E4M3FNUZ>
torch.bind_symbolic_shape %2133, [%548], affine_map<()[s0] -> (1, s0 * 32, 1024)> : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>
%int15_1934 = torch.constant.int 15
%2134 = torch.prims.convert_element_type %2133, %int15_1934 : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,1024],bf16>
torch.bind_symbolic_shape %2134, [%548], affine_map<()[s0] -> (1, s0 * 32, 1024)> : !torch.vtensor<[1,?,1024],bf16>
%int1_1935 = torch.constant.int 1
%int32_1936 = torch.constant.int 32
%int128_1937 = torch.constant.int 128
%2135 = torch.prim.ListConstruct %int1_1935, %566, %int32_1936, %int128_1937 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2136 = torch.aten.view %2114, %2135 : !torch.vtensor<[1,?,4096],bf16>, !torch.list<int> -> !torch.vtensor<[1,?,32,128],bf16>
torch.bind_symbolic_shape %2136, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16>
%int1_1938 = torch.constant.int 1
%int8_1939 = torch.constant.int 8
%int128_1940 = torch.constant.int 128
%2137 = torch.prim.ListConstruct %int1_1938, %566, %int8_1939, %int128_1940 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2138 = torch.aten.view %2124, %2137 : !torch.vtensor<[1,?,1024],bf16>, !torch.list<int> -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %2138, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
%int1_1941 = torch.constant.int 1
%int8_1942 = torch.constant.int 8
%int128_1943 = torch.constant.int 128
%2139 = torch.prim.ListConstruct %int1_1941, %566, %int8_1942, %int128_1943 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2140 = torch.aten.view %2134, %2139 : !torch.vtensor<[1,?,1024],bf16>, !torch.list<int> -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %2140, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
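// Q/K/V reshaped to [1, seq, 32, 128] and [1, seq, 8, 128]; the RoPE frequency-table construction
// is repeated below for this block, identical to the previous one.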
%int131072_1944 = torch.constant.int 131072
%none_1945 = torch.constant.none
%none_1946 = torch.constant.none
%cpu_1947 = torch.constant.device "cpu"
%false_1948 = torch.constant.bool false
%2141 = torch.aten.arange %int131072_1944, %none_1945, %none_1946, %cpu_1947, %false_1948 : !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[131072],si64>
%int0_1949 = torch.constant.int 0
%int128_1950 = torch.constant.int 128
%none_1951 = torch.constant.none
%none_1952 = torch.constant.none
%cpu_1953 = torch.constant.device "cpu"
%false_1954 = torch.constant.bool false
%2142 = torch.aten.arange.start %int0_1949, %int128_1950, %none_1951, %none_1952, %cpu_1953, %false_1954 : !torch.int, !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[128],si64>
%int2_1955 = torch.constant.int 2
%2143 = torch.aten.floor_divide.Scalar %2142, %int2_1955 : !torch.vtensor<[128],si64>, !torch.int -> !torch.vtensor<[128],si64>
%int6_1956 = torch.constant.int 6
%2144 = torch.prims.convert_element_type %2143, %int6_1956 : !torch.vtensor<[128],si64>, !torch.int -> !torch.vtensor<[128],f32>
%int128_1957 = torch.constant.int 128
%2145 = torch.aten.div.Scalar %2144, %int128_1957 : !torch.vtensor<[128],f32>, !torch.int -> !torch.vtensor<[128],f32>
%float2.000000e00_1958 = torch.constant.float 2.000000e+00
%2146 = torch.aten.mul.Scalar %2145, %float2.000000e00_1958 : !torch.vtensor<[128],f32>, !torch.float -> !torch.vtensor<[128],f32>
%float5.000000e05_1959 = torch.constant.float 5.000000e+05
%2147 = torch.aten.pow.Scalar %float5.000000e05_1959, %2146 : !torch.float, !torch.vtensor<[128],f32> -> !torch.vtensor<[128],f32>
%2148 = torch.aten.reciprocal %2147 : !torch.vtensor<[128],f32> -> !torch.vtensor<[128],f32>
%float1.000000e00_1960 = torch.constant.float 1.000000e+00
%2149 = torch.aten.mul.Scalar %2148, %float1.000000e00_1960 : !torch.vtensor<[128],f32>, !torch.float -> !torch.vtensor<[128],f32>
%int131072_1961 = torch.constant.int 131072
%int1_1962 = torch.constant.int 1
%2150 = torch.prim.ListConstruct %int131072_1961, %int1_1962 : (!torch.int, !torch.int) -> !torch.list<int>
%2151 = torch.aten.view %2141, %2150 : !torch.vtensor<[131072],si64>, !torch.list<int> -> !torch.vtensor<[131072,1],si64>
%2152 = torch.aten.mul.Tensor %2151, %2149 : !torch.vtensor<[131072,1],si64>, !torch.vtensor<[128],f32> -> !torch.vtensor<[131072,128],f32>
%int1_1963 = torch.constant.int 1
%2153 = torch.aten.size.int %2113, %int1_1963 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.int -> !torch.int
%int0_1964 = torch.constant.int 0
%2154 = torch.aten.add.int %int0_1964, %2153 : !torch.int, !torch.int -> !torch.int
%int0_1965 = torch.constant.int 0
%int0_1966 = torch.constant.int 0
%int1_1967 = torch.constant.int 1
%2155 = torch.aten.slice.Tensor %2152, %int0_1965, %int0_1966, %2154, %int1_1967 : !torch.vtensor<[131072,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],f32>
torch.bind_symbolic_shape %2155, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],f32>
%int1_1968 = torch.constant.int 1
%int0_1969 = torch.constant.int 0
%int9223372036854775807_1970 = torch.constant.int 9223372036854775807
%int1_1971 = torch.constant.int 1
%2156 = torch.aten.slice.Tensor %2155, %int1_1968, %int0_1969, %int9223372036854775807_1970, %int1_1971 : !torch.vtensor<[?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],f32>
torch.bind_symbolic_shape %2156, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],f32>
%int1_1972 = torch.constant.int 1
%int0_1973 = torch.constant.int 0
%int9223372036854775807_1974 = torch.constant.int 9223372036854775807
%int1_1975 = torch.constant.int 1
%2157 = torch.aten.slice.Tensor %2156, %int1_1972, %int0_1973, %int9223372036854775807_1974, %int1_1975 : !torch.vtensor<[?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],f32>
torch.bind_symbolic_shape %2157, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],f32>
%int0_1976 = torch.constant.int 0
%2158 = torch.aten.unsqueeze %2157, %int0_1976 : !torch.vtensor<[?,128],f32>, !torch.int -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %2158, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int1_1977 = torch.constant.int 1
%int0_1978 = torch.constant.int 0
%int9223372036854775807_1979 = torch.constant.int 9223372036854775807
%int1_1980 = torch.constant.int 1
%2159 = torch.aten.slice.Tensor %2158, %int1_1977, %int0_1978, %int9223372036854775807_1979, %int1_1980 : !torch.vtensor<[1,?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %2159, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int2_1981 = torch.constant.int 2
%int0_1982 = torch.constant.int 0
%int9223372036854775807_1983 = torch.constant.int 9223372036854775807
%int1_1984 = torch.constant.int 1
%2160 = torch.aten.slice.Tensor %2159, %int2_1981, %int0_1982, %int9223372036854775807_1983, %int1_1984 : !torch.vtensor<[1,?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %2160, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int1_1985 = torch.constant.int 1
%int1_1986 = torch.constant.int 1
%int1_1987 = torch.constant.int 1
%2161 = torch.prim.ListConstruct %int1_1985, %int1_1986, %int1_1987 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2162 = torch.aten.repeat %2160, %2161 : !torch.vtensor<[1,?,128],f32>, !torch.list<int> -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %2162, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
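// Apply RoPE to Q: upcast to f32, call the sharktank rotary-embedding kernel with the angle table, then cast back to bf16.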
%int6_1988 = torch.constant.int 6
%2163 = torch.prims.convert_element_type %2136, %int6_1988 : !torch.vtensor<[1,?,32,128],bf16>, !torch.int -> !torch.vtensor<[1,?,32,128],f32>
torch.bind_symbolic_shape %2163, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32>
%2164 = torch_c.to_builtin_tensor %2163 : !torch.vtensor<[1,?,32,128],f32> -> tensor<1x?x32x128xf32>
%2165 = torch_c.to_builtin_tensor %2162 : !torch.vtensor<[1,?,128],f32> -> tensor<1x?x128xf32>
%2166 = util.call @sharktank_rotary_embedding_1_D_32_128_f32(%2164, %2165) : (tensor<1x?x32x128xf32>, tensor<1x?x128xf32>) -> tensor<1x?x32x128xf32>
%2167 = torch_c.from_builtin_tensor %2166 : tensor<1x?x32x128xf32> -> !torch.vtensor<[1,?,32,128],f32>
torch.bind_symbolic_shape %2167, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32>
%int15_1989 = torch.constant.int 15
%2168 = torch.prims.convert_element_type %2167, %int15_1989 : !torch.vtensor<[1,?,32,128],f32>, !torch.int -> !torch.vtensor<[1,?,32,128],bf16>
torch.bind_symbolic_shape %2168, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16>
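// The same RoPE angle-table construction is repeated below for the key path.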
%int131072_1990 = torch.constant.int 131072
%none_1991 = torch.constant.none
%none_1992 = torch.constant.none
%cpu_1993 = torch.constant.device "cpu"
%false_1994 = torch.constant.bool false
%2169 = torch.aten.arange %int131072_1990, %none_1991, %none_1992, %cpu_1993, %false_1994 : !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[131072],si64>
%int0_1995 = torch.constant.int 0
%int128_1996 = torch.constant.int 128
%none_1997 = torch.constant.none
%none_1998 = torch.constant.none
%cpu_1999 = torch.constant.device "cpu"
%false_2000 = torch.constant.bool false
%2170 = torch.aten.arange.start %int0_1995, %int128_1996, %none_1997, %none_1998, %cpu_1999, %false_2000 : !torch.int, !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[128],si64>
%int2_2001 = torch.constant.int 2
%2171 = torch.aten.floor_divide.Scalar %2170, %int2_2001 : !torch.vtensor<[128],si64>, !torch.int -> !torch.vtensor<[128],si64>
%int6_2002 = torch.constant.int 6
%2172 = torch.prims.convert_element_type %2171, %int6_2002 : !torch.vtensor<[128],si64>, !torch.int -> !torch.vtensor<[128],f32>
%int128_2003 = torch.constant.int 128
%2173 = torch.aten.div.Scalar %2172, %int128_2003 : !torch.vtensor<[128],f32>, !torch.int -> !torch.vtensor<[128],f32>
%float2.000000e00_2004 = torch.constant.float 2.000000e+00
%2174 = torch.aten.mul.Scalar %2173, %float2.000000e00_2004 : !torch.vtensor<[128],f32>, !torch.float -> !torch.vtensor<[128],f32>
%float5.000000e05_2005 = torch.constant.float 5.000000e+05
%2175 = torch.aten.pow.Scalar %float5.000000e05_2005, %2174 : !torch.float, !torch.vtensor<[128],f32> -> !torch.vtensor<[128],f32>
%2176 = torch.aten.reciprocal %2175 : !torch.vtensor<[128],f32> -> !torch.vtensor<[128],f32>
%float1.000000e00_2006 = torch.constant.float 1.000000e+00
%2177 = torch.aten.mul.Scalar %2176, %float1.000000e00_2006 : !torch.vtensor<[128],f32>, !torch.float -> !torch.vtensor<[128],f32>
%int131072_2007 = torch.constant.int 131072
%int1_2008 = torch.constant.int 1
%2178 = torch.prim.ListConstruct %int131072_2007, %int1_2008 : (!torch.int, !torch.int) -> !torch.list<int>
%2179 = torch.aten.view %2169, %2178 : !torch.vtensor<[131072],si64>, !torch.list<int> -> !torch.vtensor<[131072,1],si64>
%2180 = torch.aten.mul.Tensor %2179, %2177 : !torch.vtensor<[131072,1],si64>, !torch.vtensor<[128],f32> -> !torch.vtensor<[131072,128],f32>
%int1_2009 = torch.constant.int 1
%2181 = torch.aten.size.int %2123, %int1_2009 : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>, !torch.int -> !torch.int
%int0_2010 = torch.constant.int 0
%2182 = torch.aten.add.int %int0_2010, %2181 : !torch.int, !torch.int -> !torch.int
%int0_2011 = torch.constant.int 0
%int0_2012 = torch.constant.int 0
%int1_2013 = torch.constant.int 1
%2183 = torch.aten.slice.Tensor %2180, %int0_2011, %int0_2012, %2182, %int1_2013 : !torch.vtensor<[131072,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],f32>
torch.bind_symbolic_shape %2183, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],f32>
%int1_2014 = torch.constant.int 1
%int0_2015 = torch.constant.int 0
%int9223372036854775807_2016 = torch.constant.int 9223372036854775807
%int1_2017 = torch.constant.int 1
%2184 = torch.aten.slice.Tensor %2183, %int1_2014, %int0_2015, %int9223372036854775807_2016, %int1_2017 : !torch.vtensor<[?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],f32>
torch.bind_symbolic_shape %2184, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],f32>
%int1_2018 = torch.constant.int 1
%int0_2019 = torch.constant.int 0
%int9223372036854775807_2020 = torch.constant.int 9223372036854775807
%int1_2021 = torch.constant.int 1
%2185 = torch.aten.slice.Tensor %2184, %int1_2018, %int0_2019, %int9223372036854775807_2020, %int1_2021 : !torch.vtensor<[?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],f32>
torch.bind_symbolic_shape %2185, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],f32>
%int0_2022 = torch.constant.int 0
%2186 = torch.aten.unsqueeze %2185, %int0_2022 : !torch.vtensor<[?,128],f32>, !torch.int -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %2186, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int1_2023 = torch.constant.int 1
%int0_2024 = torch.constant.int 0
%int9223372036854775807_2025 = torch.constant.int 9223372036854775807
%int1_2026 = torch.constant.int 1
%2187 = torch.aten.slice.Tensor %2186, %int1_2023, %int0_2024, %int9223372036854775807_2025, %int1_2026 : !torch.vtensor<[1,?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %2187, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int2_2027 = torch.constant.int 2
%int0_2028 = torch.constant.int 0
%int9223372036854775807_2029 = torch.constant.int 9223372036854775807
%int1_2030 = torch.constant.int 1
%2188 = torch.aten.slice.Tensor %2187, %int2_2027, %int0_2028, %int9223372036854775807_2029, %int1_2030 : !torch.vtensor<[1,?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %2188, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int1_2031 = torch.constant.int 1
%int1_2032 = torch.constant.int 1
%int1_2033 = torch.constant.int 1
%2189 = torch.prim.ListConstruct %int1_2031, %int1_2032, %int1_2033 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2190 = torch.aten.repeat %2188, %2189 : !torch.vtensor<[1,?,128],f32>, !torch.list<int> -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %2190, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int6_2034 = torch.constant.int 6
%2191 = torch.prims.convert_element_type %2138, %int6_2034 : !torch.vtensor<[1,?,8,128],bf16>, !torch.int -> !torch.vtensor<[1,?,8,128],f32>
torch.bind_symbolic_shape %2191, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],f32>
%2192 = torch_c.to_builtin_tensor %2191 : !torch.vtensor<[1,?,8,128],f32> -> tensor<1x?x8x128xf32>
%2193 = torch_c.to_builtin_tensor %2190 : !torch.vtensor<[1,?,128],f32> -> tensor<1x?x128xf32>
%2194 = util.call @sharktank_rotary_embedding_1_D_8_128_f32(%2192, %2193) : (tensor<1x?x8x128xf32>, tensor<1x?x128xf32>) -> tensor<1x?x8x128xf32>
%2195 = torch_c.from_builtin_tensor %2194 : tensor<1x?x8x128xf32> -> !torch.vtensor<[1,?,8,128],f32>
torch.bind_symbolic_shape %2195, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],f32>
%int15_2035 = torch.constant.int 15
%2196 = torch.prims.convert_element_type %2195, %int15_2035 : !torch.vtensor<[1,?,8,128],f32>, !torch.int -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %2196, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
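// Quantize the rotated K and the V projection for the KV cache: divide by the scalar scale %127, clamp to the
// f8E4M3FNUZ range [-240, 240], and convert to f8E4M3FNUZ.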
%2197 = torch.aten.div.Tensor %2196, %127 : !torch.vtensor<[1,?,8,128],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %2197, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
%float-2.400000e02_2036 = torch.constant.float -2.400000e+02
%float2.400000e02_2037 = torch.constant.float 2.400000e+02
%2198 = torch.aten.clamp %2197, %float-2.400000e02_2036, %float2.400000e02_2037 : !torch.vtensor<[1,?,8,128],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %2198, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
%int26_2038 = torch.constant.int 26
%2199 = torch.prims.convert_element_type %2198, %int26_2038 : !torch.vtensor<[1,?,8,128],bf16>, !torch.int -> !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %2199, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>
%2200 = torch.aten.div.Tensor %2140, %127 : !torch.vtensor<[1,?,8,128],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %2200, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
%float-2.400000e02_2039 = torch.constant.float -2.400000e+02
%float2.400000e02_2040 = torch.constant.float 2.400000e+02
%2201 = torch.aten.clamp %2200, %float-2.400000e02_2039, %float2.400000e02_2040 : !torch.vtensor<[1,?,8,128],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %2201, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
%int26_2041 = torch.constant.int 26
%2202 = torch.prims.convert_element_type %2201, %int26_2041 : !torch.vtensor<[1,?,8,128],bf16>, !torch.int -> !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %2202, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>
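// Write K into the paged KV cache. The page ids in %arg2 are scaled by 64 slots per page and offset by 14,
// which appears to be this block's K slot in the per-page layout [32 layers, 2 (K/V), 32 tokens, 8 heads, 128];
// the flat cache buffer [?, 2097152] (f16) is viewed in that layout, flattened, and updated with index_put.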
%int64_2042 = torch.constant.int 64
%2203 = torch.aten.mul.Scalar %arg2, %int64_2042 : !torch.vtensor<[1,?],si64>, !torch.int -> !torch.vtensor<[1,?],si64>
torch.bind_symbolic_shape %2203, [%548], affine_map<()[s0] -> (1, s0)> : !torch.vtensor<[1,?],si64>
%int14 = torch.constant.int 14
%int1_2043 = torch.constant.int 1
%2204 = torch.aten.add.Scalar %2203, %int14, %int1_2043 : !torch.vtensor<[1,?],si64>, !torch.int, !torch.int -> !torch.vtensor<[1,?],si64>
torch.bind_symbolic_shape %2204, [%548], affine_map<()[s0] -> (1, s0)> : !torch.vtensor<[1,?],si64>
%int1_2044 = torch.constant.int 1
%int32_2045 = torch.constant.int 32
%int8_2046 = torch.constant.int 8
%int128_2047 = torch.constant.int 128
%2205 = torch.prim.ListConstruct %int1_2044, %670, %int32_2045, %int8_2046, %int128_2047 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2206 = torch.aten.view %2199, %2205 : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %2206, [%548], affine_map<()[s0] -> (1, s0, 32, 8, 128)> : !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>
%int32_2048 = torch.constant.int 32
%int8_2049 = torch.constant.int 8
%int128_2050 = torch.constant.int 128
%2207 = torch.prim.ListConstruct %670, %int32_2048, %int8_2049, %int128_2050 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2208 = torch.aten.view %2206, %2207 : !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %2208, [%548], affine_map<()[s0] -> (s0, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
%2209 = torch.prim.ListConstruct %670 : (!torch.int) -> !torch.list<int>
%2210 = torch.aten.view %2204, %2209 : !torch.vtensor<[1,?],si64>, !torch.list<int> -> !torch.vtensor<[?],si64>
torch.bind_symbolic_shape %2210, [%548], affine_map<()[s0] -> (s0)> : !torch.vtensor<[?],si64>
%int32_2051 = torch.constant.int 32
%int2_2052 = torch.constant.int 2
%int32_2053 = torch.constant.int 32
%int8_2054 = torch.constant.int 8
%int128_2055 = torch.constant.int 128
%2211 = torch.prim.ListConstruct %661, %int32_2051, %int2_2052, %int32_2053, %int8_2054, %int128_2055 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2212 = torch.aten.view %2019, %2211 : !torch.vtensor<[?,2097152],f16>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f16>
torch.bind_symbolic_shape %2212, [%549], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f16>
%int32_2056 = torch.constant.int 32
%2213 = torch.aten.mul.int %661, %int32_2056 : !torch.int, !torch.int -> !torch.int
%int2_2057 = torch.constant.int 2
%2214 = torch.aten.mul.int %2213, %int2_2057 : !torch.int, !torch.int -> !torch.int
%int32_2058 = torch.constant.int 32
%int8_2059 = torch.constant.int 8
%int128_2060 = torch.constant.int 128
%2215 = torch.prim.ListConstruct %2214, %int32_2058, %int8_2059, %int128_2060 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2216 = torch.aten.view %2212, %2215 : !torch.vtensor<[?,32,2,32,8,128],f16>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f16>
torch.bind_symbolic_shape %2216, [%549], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f16>
%2217 = torch.prim.ListConstruct %2210 : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>>
%false_2061 = torch.constant.bool false
%2218 = torch.aten.index_put %2216, %2217, %2208, %false_2061 : !torch.vtensor<[?,32,8,128],f16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.bool -> !torch.vtensor<[?,32,8,128],f16>
torch.bind_symbolic_shape %2218, [%549], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f16>
%int32_2062 = torch.constant.int 32
%int2_2063 = torch.constant.int 2
%int32_2064 = torch.constant.int 32
%int8_2065 = torch.constant.int 8
%int128_2066 = torch.constant.int 128
%2219 = torch.prim.ListConstruct %661, %int32_2062, %int2_2063, %int32_2064, %int8_2065, %int128_2066 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2220 = torch.aten.view %2218, %2219 : !torch.vtensor<[?,32,8,128],f16>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f16>
torch.bind_symbolic_shape %2220, [%549], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f16>
%int2097152_2067 = torch.constant.int 2097152
%2221 = torch.prim.ListConstruct %661, %int2097152_2067 : (!torch.int, !torch.int) -> !torch.list<int>
%2222 = torch.aten.view %2220, %2221 : !torch.vtensor<[?,32,2,32,8,128],f16>, !torch.list<int> -> !torch.vtensor<[?,2097152],f16>
torch.bind_symbolic_shape %2222, [%549], affine_map<()[s0] -> (s0, 2097152)> : !torch.vtensor<[?,2097152],f16>
%int32_2068 = torch.constant.int 32
%int2_2069 = torch.constant.int 2
%int32_2070 = torch.constant.int 32
%int8_2071 = torch.constant.int 8
%int128_2072 = torch.constant.int 128
%2223 = torch.prim.ListConstruct %661, %int32_2068, %int2_2069, %int32_2070, %int8_2071, %int128_2072 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2224 = torch.aten.view %2222, %2223 : !torch.vtensor<[?,2097152],f16>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f16>
torch.bind_symbolic_shape %2224, [%549], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f16>
%int32_2073 = torch.constant.int 32
%int8_2074 = torch.constant.int 8
%int128_2075 = torch.constant.int 128
%2225 = torch.prim.ListConstruct %2214, %int32_2073, %int8_2074, %int128_2075 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2226 = torch.aten.view %2224, %2225 : !torch.vtensor<[?,32,2,32,8,128],f16>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f16>
torch.bind_symbolic_shape %2226, [%549], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f16>
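// Scatter the quantized V at the adjacent slot (index + 1, i.e. offset 15), then view the cache back to the flat
// [?, 2097152] page buffer.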
%int1_2076 = torch.constant.int 1
%int32_2077 = torch.constant.int 32
%int8_2078 = torch.constant.int 8
%int128_2079 = torch.constant.int 128
%2227 = torch.prim.ListConstruct %int1_2076, %670, %int32_2077, %int8_2078, %int128_2079 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2228 = torch.aten.view %2202, %2227 : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %2228, [%548], affine_map<()[s0] -> (1, s0, 32, 8, 128)> : !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>
%int32_2080 = torch.constant.int 32
%int8_2081 = torch.constant.int 8
%int128_2082 = torch.constant.int 128
%2229 = torch.prim.ListConstruct %670, %int32_2080, %int8_2081, %int128_2082 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2230 = torch.aten.view %2228, %2229 : !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %2230, [%548], affine_map<()[s0] -> (s0, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
%int1_2083 = torch.constant.int 1
%int1_2084 = torch.constant.int 1
%2231 = torch.aten.add.Scalar %2204, %int1_2083, %int1_2084 : !torch.vtensor<[1,?],si64>, !torch.int, !torch.int -> !torch.vtensor<[1,?],si64>
torch.bind_symbolic_shape %2231, [%548], affine_map<()[s0] -> (1, s0)> : !torch.vtensor<[1,?],si64>
%2232 = torch.prim.ListConstruct %670 : (!torch.int) -> !torch.list<int>
%2233 = torch.aten.view %2231, %2232 : !torch.vtensor<[1,?],si64>, !torch.list<int> -> !torch.vtensor<[?],si64>
torch.bind_symbolic_shape %2233, [%548], affine_map<()[s0] -> (s0)> : !torch.vtensor<[?],si64>
%2234 = torch.prim.ListConstruct %2233 : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>>
%false_2085 = torch.constant.bool false
%2235 = torch.aten.index_put %2226, %2234, %2230, %false_2085 : !torch.vtensor<[?,32,8,128],f16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.bool -> !torch.vtensor<[?,32,8,128],f16>
torch.bind_symbolic_shape %2235, [%549], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f16>
%int32_2086 = torch.constant.int 32
%int2_2087 = torch.constant.int 2
%int32_2088 = torch.constant.int 32
%int8_2089 = torch.constant.int 8
%int128_2090 = torch.constant.int 128
%2236 = torch.prim.ListConstruct %661, %int32_2086, %int2_2087, %int32_2088, %int8_2089, %int128_2090 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2237 = torch.aten.view %2235, %2236 : !torch.vtensor<[?,32,8,128],f16>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f16>
torch.bind_symbolic_shape %2237, [%549], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f16>
%int2097152_2091 = torch.constant.int 2097152
%2238 = torch.prim.ListConstruct %661, %int2097152_2091 : (!torch.int, !torch.int) -> !torch.list<int>
%2239 = torch.aten.view %2237, %2238 : !torch.vtensor<[?,32,2,32,8,128],f16>, !torch.list<int> -> !torch.vtensor<[?,2097152],f16>
torch.bind_symbolic_shape %2239, [%549], affine_map<()[s0] -> (s0, 2097152)> : !torch.vtensor<[?,2097152],f16>
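// Grouped-query attention expansion (repeat_kv): unsqueeze the 8 KV heads and expand each by a factor of 4,
// giving K and V with 32 heads to match the query heads.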
%int-2_2092 = torch.constant.int -2
%2240 = torch.aten.unsqueeze %2199, %int-2_2092 : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %2240, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 1, 128)> : !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>
%int1_2093 = torch.constant.int 1
%int8_2094 = torch.constant.int 8
%int4_2095 = torch.constant.int 4
%int128_2096 = torch.constant.int 128
%2241 = torch.prim.ListConstruct %int1_2093, %2181, %int8_2094, %int4_2095, %int128_2096 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%false_2097 = torch.constant.bool false
%2242 = torch.aten.expand %2240, %2241, %false_2097 : !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %2242, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 4, 128)> : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
%int0_2098 = torch.constant.int 0
%2243 = torch.aten.clone %2242, %int0_2098 : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %2243, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 4, 128)> : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
%int1_2099 = torch.constant.int 1
%int32_2100 = torch.constant.int 32
%int128_2101 = torch.constant.int 128
%2244 = torch.prim.ListConstruct %int1_2099, %2181, %int32_2100, %int128_2101 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2245 = torch.aten._unsafe_view %2243, %2244 : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %2245, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>
%int-2_2102 = torch.constant.int -2
%2246 = torch.aten.unsqueeze %2202, %int-2_2102 : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %2246, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 1, 128)> : !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>
%int1_2103 = torch.constant.int 1
%2247 = torch.aten.size.int %2133, %int1_2103 : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>, !torch.int -> !torch.int
%int1_2104 = torch.constant.int 1
%int8_2105 = torch.constant.int 8
%int4_2106 = torch.constant.int 4
%int128_2107 = torch.constant.int 128
%2248 = torch.prim.ListConstruct %int1_2104, %2247, %int8_2105, %int4_2106, %int128_2107 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%false_2108 = torch.constant.bool false
%2249 = torch.aten.expand %2246, %2248, %false_2108 : !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %2249, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 4, 128)> : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
%int0_2109 = torch.constant.int 0
%2250 = torch.aten.clone %2249, %int0_2109 : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %2250, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 4, 128)> : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
%int1_2110 = torch.constant.int 1
%int32_2111 = torch.constant.int 32
%int128_2112 = torch.constant.int 128
%2251 = torch.prim.ListConstruct %int1_2110, %2247, %int32_2111, %int128_2112 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2252 = torch.aten._unsafe_view %2250, %2251 : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %2252, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>
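// Dequantize the expanded K/V for attention: convert f8E4M3FNUZ to f32, multiply by the scale %127, and cast to bf16.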
%int6_2113 = torch.constant.int 6
%2253 = torch.prims.convert_element_type %2245, %int6_2113 : !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,32,128],f32>
torch.bind_symbolic_shape %2253, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32>
%2254 = torch.aten.mul.Tensor %2253, %127 : !torch.vtensor<[1,?,32,128],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,32,128],f32>
torch.bind_symbolic_shape %2254, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32>
%int15_2114 = torch.constant.int 15
%2255 = torch.prims.convert_element_type %2254, %int15_2114 : !torch.vtensor<[1,?,32,128],f32>, !torch.int -> !torch.vtensor<[1,?,32,128],bf16>
torch.bind_symbolic_shape %2255, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16>
%int6_2115 = torch.constant.int 6
%2256 = torch.prims.convert_element_type %2252, %int6_2115 : !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,32,128],f32>
torch.bind_symbolic_shape %2256, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32>
%2257 = torch.aten.mul.Tensor %2256, %127 : !torch.vtensor<[1,?,32,128],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,32,128],f32>
torch.bind_symbolic_shape %2257, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32>
%int15_2116 = torch.constant.int 15
%2258 = torch.prims.convert_element_type %2257, %int15_2116 : !torch.vtensor<[1,?,32,128],f32>, !torch.int -> !torch.vtensor<[1,?,32,128],bf16>
torch.bind_symbolic_shape %2258, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16>
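// Transpose Q/K/V to [1, 32, seq, 128] and run scaled dot-product flash attention (dropout 0.0, is_causal = true).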
%int1_2117 = torch.constant.int 1
%int2_2118 = torch.constant.int 2
%2259 = torch.aten.transpose.int %2168, %int1_2117, %int2_2118 : !torch.vtensor<[1,?,32,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,32,?,128],bf16>
torch.bind_symbolic_shape %2259, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16>
%int1_2119 = torch.constant.int 1
%int2_2120 = torch.constant.int 2
%2260 = torch.aten.transpose.int %2255, %int1_2119, %int2_2120 : !torch.vtensor<[1,?,32,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,32,?,128],bf16>
torch.bind_symbolic_shape %2260, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16>
%int1_2121 = torch.constant.int 1
%int2_2122 = torch.constant.int 2
%2261 = torch.aten.transpose.int %2258, %int1_2121, %int2_2122 : !torch.vtensor<[1,?,32,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,32,?,128],bf16>
torch.bind_symbolic_shape %2261, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16>
%float0.000000e00_2123 = torch.constant.float 0.000000e+00
%true_2124 = torch.constant.bool true
%none_2125 = torch.constant.none
%none_2126 = torch.constant.none
%2262:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%2259, %2260, %2261, %float0.000000e00_2123, %true_2124, %none_2125, %none_2126) : (!torch.vtensor<[1,32,?,128],bf16>, !torch.vtensor<[1,32,?,128],bf16>, !torch.vtensor<[1,32,?,128],bf16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[1,32,?,128],bf16>, !torch.vtensor<[1,32,?],f32>)
torch.bind_symbolic_shape %2262#0, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16>
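// Transpose the attention output back to [1, seq, 32, 128], flatten the heads to 4096, re-quantize to f8E4M3FNUZ
// (scale %128, clamp to [-240, 240]), and apply the attn_output projection (4096 x 4096 matmul).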
%int1_2127 = torch.constant.int 1
%int2_2128 = torch.constant.int 2
%2263 = torch.aten.transpose.int %2262#0, %int1_2127, %int2_2128 : !torch.vtensor<[1,32,?,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,?,32,128],bf16>
torch.bind_symbolic_shape %2263, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16>
%int1_2129 = torch.constant.int 1
%int4096_2130 = torch.constant.int 4096
%2264 = torch.prim.ListConstruct %int1_2129, %2153, %int4096_2130 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2265 = torch.aten.view %2263, %2264 : !torch.vtensor<[1,?,32,128],bf16>, !torch.list<int> -> !torch.vtensor<[1,?,4096],bf16>
torch.bind_symbolic_shape %2265, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16>
%2266 = torch.aten.div.Tensor %2265, %128 : !torch.vtensor<[1,?,4096],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],bf16>
torch.bind_symbolic_shape %2266, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16>
%float-2.400000e02_2131 = torch.constant.float -2.400000e+02
%float2.400000e02_2132 = torch.constant.float 2.400000e+02
%2267 = torch.aten.clamp %2266, %float-2.400000e02_2131, %float2.400000e02_2132 : !torch.vtensor<[1,?,4096],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],bf16>
torch.bind_symbolic_shape %2267, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16>
%int26_2133 = torch.constant.int 26
%2268 = torch.prims.convert_element_type %2267, %int26_2133 : !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2268, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int-2_2134 = torch.constant.int -2
%int-1_2135 = torch.constant.int -1
%2269 = torch.aten.transpose.int %129, %int-2_2134, %int-1_2135 : !torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%int4096_2136 = torch.constant.int 4096
%2270 = torch.prim.ListConstruct %2153, %int4096_2136 : (!torch.int, !torch.int) -> !torch.list<int>
%2271 = torch.aten.view %2268, %2270 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2271, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%2272 = torch.aten.mm %2271, %2269 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,4096],f8E4M3FNUZ> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2272, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%int1_2137 = torch.constant.int 1
%int4096_2138 = torch.constant.int 4096
%2273 = torch.prim.ListConstruct %int1_2137, %2153, %int4096_2138 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2274 = torch.aten.view %2272, %2273 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2274, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int15_2139 = torch.constant.int 15
%2275 = torch.prims.convert_element_type %2274, %int15_2139 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,4096],bf16>
torch.bind_symbolic_shape %2275, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16>
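// First residual connection: add the attention output onto the f32 residual stream, then apply the ffn_norm
// RMSNorm and quantize the result to f8E4M3FNUZ for the feed-forward matmuls. Illustrative pseudocode:
//   y = x / sqrt(mean(x * x, dim=-1) + 1e-05) * norm_weight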
%int1_2140 = torch.constant.int 1
%2276 = torch.aten.add.Tensor %2097, %2275, %int1_2140 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2276, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int2_2141 = torch.constant.int 2
%2277 = torch.aten.pow.Tensor_Scalar %2276, %int2_2141 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2277, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int-1_2142 = torch.constant.int -1
%2278 = torch.prim.ListConstruct %int-1_2142 : (!torch.int) -> !torch.list<int>
%true_2143 = torch.constant.bool true
%none_2144 = torch.constant.none
%2279 = torch.aten.mean.dim %2277, %2278, %true_2143, %none_2144 : !torch.vtensor<[1,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,?,1],f32>
torch.bind_symbolic_shape %2279, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32>
%float1.000000e-05_2145 = torch.constant.float 1.000000e-05
%int1_2146 = torch.constant.int 1
%2280 = torch.aten.add.Scalar %2279, %float1.000000e-05_2145, %int1_2146 : !torch.vtensor<[1,?,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,?,1],f32>
torch.bind_symbolic_shape %2280, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32>
%2281 = torch.aten.rsqrt %2280 : !torch.vtensor<[1,?,1],f32> -> !torch.vtensor<[1,?,1],f32>
torch.bind_symbolic_shape %2281, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32>
%2282 = torch.aten.mul.Tensor %2276, %2281 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[1,?,1],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2282, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%2283 = torch.aten.mul.Tensor %130, %2282 : !torch.vtensor<[4096],bf16>, !torch.vtensor<[1,?,4096],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2283, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%2284 = torch.aten.div.Tensor %2283, %131 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2284, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%float-2.400000e02_2147 = torch.constant.float -2.400000e+02
%float2.400000e02_2148 = torch.constant.float 2.400000e+02
%2285 = torch.aten.clamp %2284, %float-2.400000e02_2147, %float2.400000e02_2148 : !torch.vtensor<[1,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2285, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int26_2149 = torch.constant.int 26
%2286 = torch.prims.convert_element_type %2285, %int26_2149 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2286, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
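// ffn_gate projection: f8 matmul from 4096 to 14336, dequantized to bf16 and passed through SiLU.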
%int-2_2150 = torch.constant.int -2
%int-1_2151 = torch.constant.int -1
%2287 = torch.aten.transpose.int %132, %int-2_2150, %int-1_2151 : !torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
%int4096_2152 = torch.constant.int 4096
%2288 = torch.prim.ListConstruct %566, %int4096_2152 : (!torch.int, !torch.int) -> !torch.list<int>
%2289 = torch.aten.view %2286, %2288 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2289, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%2290 = torch.aten.mm %2289, %2287 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,14336],f8E4M3FNUZ> -> !torch.vtensor<[?,14336],f8E4M3FNUZ>
torch.bind_symbolic_shape %2290, [%548], affine_map<()[s0] -> (s0 * 32, 14336)> : !torch.vtensor<[?,14336],f8E4M3FNUZ>
%int1_2153 = torch.constant.int 1
%int14336_2154 = torch.constant.int 14336
%2291 = torch.prim.ListConstruct %int1_2153, %566, %int14336_2154 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2292 = torch.aten.view %2290, %2291 : !torch.vtensor<[?,14336],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,14336],f8E4M3FNUZ>
torch.bind_symbolic_shape %2292, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>
%int15_2155 = torch.constant.int 15
%2293 = torch.prims.convert_element_type %2292, %int15_2155 : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,14336],bf16>
torch.bind_symbolic_shape %2293, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16>
%2294 = torch.aten.silu %2293 : !torch.vtensor<[1,?,14336],bf16> -> !torch.vtensor<[1,?,14336],bf16>
torch.bind_symbolic_shape %2294, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16>
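// ffn_up projection (4096 -> 14336); multiplying silu(gate) by up below forms the SwiGLU activation.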
%2295 = torch.aten.div.Tensor %2283, %133 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2295, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%float-2.400000e02_2156 = torch.constant.float -2.400000e+02
%float2.400000e02_2157 = torch.constant.float 2.400000e+02
%2296 = torch.aten.clamp %2295, %float-2.400000e02_2156, %float2.400000e02_2157 : !torch.vtensor<[1,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2296, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int26_2158 = torch.constant.int 26
%2297 = torch.prims.convert_element_type %2296, %int26_2158 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2297, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int-2_2159 = torch.constant.int -2
%int-1_2160 = torch.constant.int -1
%2298 = torch.aten.transpose.int %134, %int-2_2159, %int-1_2160 : !torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
%int4096_2161 = torch.constant.int 4096
%2299 = torch.prim.ListConstruct %566, %int4096_2161 : (!torch.int, !torch.int) -> !torch.list<int>
%2300 = torch.aten.view %2297, %2299 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2300, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%2301 = torch.aten.mm %2300, %2298 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,14336],f8E4M3FNUZ> -> !torch.vtensor<[?,14336],f8E4M3FNUZ>
torch.bind_symbolic_shape %2301, [%548], affine_map<()[s0] -> (s0 * 32, 14336)> : !torch.vtensor<[?,14336],f8E4M3FNUZ>
%int1_2162 = torch.constant.int 1
%int14336_2163 = torch.constant.int 14336
%2302 = torch.prim.ListConstruct %int1_2162, %566, %int14336_2163 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2303 = torch.aten.view %2301, %2302 : !torch.vtensor<[?,14336],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,14336],f8E4M3FNUZ>
torch.bind_symbolic_shape %2303, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>
%int15_2164 = torch.constant.int 15
%2304 = torch.prims.convert_element_type %2303, %int15_2164 : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,14336],bf16>
torch.bind_symbolic_shape %2304, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16>
%2305 = torch.aten.mul.Tensor %2294, %2304 : !torch.vtensor<[1,?,14336],bf16>, !torch.vtensor<[1,?,14336],bf16> -> !torch.vtensor<[1,?,14336],bf16>
torch.bind_symbolic_shape %2305, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16>
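// Re-quantize the SwiGLU product to f8E4M3FNUZ and apply the ffn_down projection (14336 -> 4096).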
%2306 = torch.aten.div.Tensor %2305, %135 : !torch.vtensor<[1,?,14336],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,14336],bf16>
torch.bind_symbolic_shape %2306, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16>
%float-2.400000e02_2165 = torch.constant.float -2.400000e+02
%float2.400000e02_2166 = torch.constant.float 2.400000e+02
%2307 = torch.aten.clamp %2306, %float-2.400000e02_2165, %float2.400000e02_2166 : !torch.vtensor<[1,?,14336],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,14336],bf16>
torch.bind_symbolic_shape %2307, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16>
%int26_2167 = torch.constant.int 26
%2308 = torch.prims.convert_element_type %2307, %int26_2167 : !torch.vtensor<[1,?,14336],bf16>, !torch.int -> !torch.vtensor<[1,?,14336],f8E4M3FNUZ>
torch.bind_symbolic_shape %2308, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>
%int-2_2168 = torch.constant.int -2
%int-1_2169 = torch.constant.int -1
%2309 = torch.aten.transpose.int %136, %int-2_2168, %int-1_2169 : !torch.vtensor<[4096,14336],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%int1_2170 = torch.constant.int 1
%2310 = torch.aten.size.int %2292, %int1_2170 : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>, !torch.int -> !torch.int
%int14336_2171 = torch.constant.int 14336
%2311 = torch.prim.ListConstruct %2310, %int14336_2171 : (!torch.int, !torch.int) -> !torch.list<int>
%2312 = torch.aten.view %2308, %2311 : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,14336],f8E4M3FNUZ>
torch.bind_symbolic_shape %2312, [%548], affine_map<()[s0] -> (s0 * 32, 14336)> : !torch.vtensor<[?,14336],f8E4M3FNUZ>
%2313 = torch.aten.mm %2312, %2309 : !torch.vtensor<[?,14336],f8E4M3FNUZ>, !torch.vtensor<[14336,4096],f8E4M3FNUZ> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2313, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%int1_2172 = torch.constant.int 1
%int4096_2173 = torch.constant.int 4096
%2314 = torch.prim.ListConstruct %int1_2172, %2310, %int4096_2173 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2315 = torch.aten.view %2313, %2314 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2315, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int15_2174 = torch.constant.int 15
%2316 = torch.prims.convert_element_type %2315, %int15_2174 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,4096],bf16>
torch.bind_symbolic_shape %2316, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16>
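// Second residual connection: add the FFN output onto the residual stream, then the next block's attn_norm
// RMSNorm followed by per-input quantization for its Q/K/V projections.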
%int1_2175 = torch.constant.int 1
%2317 = torch.aten.add.Tensor %2276, %2316, %int1_2175 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2317, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int2_2176 = torch.constant.int 2
%2318 = torch.aten.pow.Tensor_Scalar %2317, %int2_2176 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2318, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int-1_2177 = torch.constant.int -1
%2319 = torch.prim.ListConstruct %int-1_2177 : (!torch.int) -> !torch.list<int>
%true_2178 = torch.constant.bool true
%none_2179 = torch.constant.none
%2320 = torch.aten.mean.dim %2318, %2319, %true_2178, %none_2179 : !torch.vtensor<[1,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,?,1],f32>
torch.bind_symbolic_shape %2320, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32>
%float1.000000e-05_2180 = torch.constant.float 1.000000e-05
%int1_2181 = torch.constant.int 1
%2321 = torch.aten.add.Scalar %2320, %float1.000000e-05_2180, %int1_2181 : !torch.vtensor<[1,?,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,?,1],f32>
torch.bind_symbolic_shape %2321, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32>
%2322 = torch.aten.rsqrt %2321 : !torch.vtensor<[1,?,1],f32> -> !torch.vtensor<[1,?,1],f32>
torch.bind_symbolic_shape %2322, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32>
%2323 = torch.aten.mul.Tensor %2317, %2322 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[1,?,1],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2323, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%2324 = torch.aten.mul.Tensor %137, %2323 : !torch.vtensor<[4096],bf16>, !torch.vtensor<[1,?,4096],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2324, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%2325 = torch.aten.div.Tensor %2324, %138 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2325, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%float-2.400000e02_2182 = torch.constant.float -2.400000e+02
%float2.400000e02_2183 = torch.constant.float 2.400000e+02
%2326 = torch.aten.clamp %2325, %float-2.400000e02_2182, %float2.400000e02_2183 : !torch.vtensor<[1,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2326, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int26_2184 = torch.constant.int 26
%2327 = torch.prims.convert_element_type %2326, %int26_2184 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2327, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
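// attn_q projection in f8 (4096 x 4096), followed by the attn_k and attn_v projections (1024 x 4096); each path
// re-scales the normed activations with its own input scale (%138 / %140 / %142), clamps and quantizes before
// the matmul, and dequantizes the result to bf16.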
%int-2_2185 = torch.constant.int -2
%int-1_2186 = torch.constant.int -1
%2328 = torch.aten.transpose.int %139, %int-2_2185, %int-1_2186 : !torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%int4096_2187 = torch.constant.int 4096
%2329 = torch.prim.ListConstruct %566, %int4096_2187 : (!torch.int, !torch.int) -> !torch.list<int>
%2330 = torch.aten.view %2327, %2329 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2330, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%2331 = torch.aten.mm %2330, %2328 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,4096],f8E4M3FNUZ> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2331, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%int1_2188 = torch.constant.int 1
%int4096_2189 = torch.constant.int 4096
%2332 = torch.prim.ListConstruct %int1_2188, %566, %int4096_2189 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2333 = torch.aten.view %2331, %2332 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2333, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int15_2190 = torch.constant.int 15
%2334 = torch.prims.convert_element_type %2333, %int15_2190 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,4096],bf16>
torch.bind_symbolic_shape %2334, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16>
%2335 = torch.aten.div.Tensor %2324, %140 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2335, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%float-2.400000e02_2191 = torch.constant.float -2.400000e+02
%float2.400000e02_2192 = torch.constant.float 2.400000e+02
%2336 = torch.aten.clamp %2335, %float-2.400000e02_2191, %float2.400000e02_2192 : !torch.vtensor<[1,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2336, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int26_2193 = torch.constant.int 26
%2337 = torch.prims.convert_element_type %2336, %int26_2193 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2337, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int-2_2194 = torch.constant.int -2
%int-1_2195 = torch.constant.int -1
%2338 = torch.aten.transpose.int %141, %int-2_2194, %int-1_2195 : !torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,1024],f8E4M3FNUZ>
%int4096_2196 = torch.constant.int 4096
%2339 = torch.prim.ListConstruct %566, %int4096_2196 : (!torch.int, !torch.int) -> !torch.list<int>
%2340 = torch.aten.view %2337, %2339 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2340, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%2341 = torch.aten.mm %2340, %2338 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,1024],f8E4M3FNUZ> -> !torch.vtensor<[?,1024],f8E4M3FNUZ>
torch.bind_symbolic_shape %2341, [%548], affine_map<()[s0] -> (s0 * 32, 1024)> : !torch.vtensor<[?,1024],f8E4M3FNUZ>
%int1_2197 = torch.constant.int 1
%int1024_2198 = torch.constant.int 1024
%2342 = torch.prim.ListConstruct %int1_2197, %566, %int1024_2198 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2343 = torch.aten.view %2341, %2342 : !torch.vtensor<[?,1024],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,1024],f8E4M3FNUZ>
torch.bind_symbolic_shape %2343, [%548], affine_map<()[s0] -> (1, s0 * 32, 1024)> : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>
%int15_2199 = torch.constant.int 15
%2344 = torch.prims.convert_element_type %2343, %int15_2199 : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,1024],bf16>
torch.bind_symbolic_shape %2344, [%548], affine_map<()[s0] -> (1, s0 * 32, 1024)> : !torch.vtensor<[1,?,1024],bf16>
%2345 = torch.aten.div.Tensor %2324, %142 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2345, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%float-2.400000e02_2200 = torch.constant.float -2.400000e+02
%float2.400000e02_2201 = torch.constant.float 2.400000e+02
%2346 = torch.aten.clamp %2345, %float-2.400000e02_2200, %float2.400000e02_2201 : !torch.vtensor<[1,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2346, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int26_2202 = torch.constant.int 26
%2347 = torch.prims.convert_element_type %2346, %int26_2202 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2347, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int-2_2203 = torch.constant.int -2
%int-1_2204 = torch.constant.int -1
%2348 = torch.aten.transpose.int %143, %int-2_2203, %int-1_2204 : !torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,1024],f8E4M3FNUZ>
%int4096_2205 = torch.constant.int 4096
%2349 = torch.prim.ListConstruct %566, %int4096_2205 : (!torch.int, !torch.int) -> !torch.list<int>
%2350 = torch.aten.view %2347, %2349 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2350, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%2351 = torch.aten.mm %2350, %2348 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,1024],f8E4M3FNUZ> -> !torch.vtensor<[?,1024],f8E4M3FNUZ>
torch.bind_symbolic_shape %2351, [%548], affine_map<()[s0] -> (s0 * 32, 1024)> : !torch.vtensor<[?,1024],f8E4M3FNUZ>
%int1_2206 = torch.constant.int 1
%int1024_2207 = torch.constant.int 1024
%2352 = torch.prim.ListConstruct %int1_2206, %566, %int1024_2207 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2353 = torch.aten.view %2351, %2352 : !torch.vtensor<[?,1024],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,1024],f8E4M3FNUZ>
torch.bind_symbolic_shape %2353, [%548], affine_map<()[s0] -> (1, s0 * 32, 1024)> : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>
%int15_2208 = torch.constant.int 15
%2354 = torch.prims.convert_element_type %2353, %int15_2208 : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,1024],bf16>
torch.bind_symbolic_shape %2354, [%548], affine_map<()[s0] -> (1, s0 * 32, 1024)> : !torch.vtensor<[1,?,1024],bf16>
%int1_2209 = torch.constant.int 1
%int32_2210 = torch.constant.int 32
%int128_2211 = torch.constant.int 128
%2355 = torch.prim.ListConstruct %int1_2209, %566, %int32_2210, %int128_2211 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2356 = torch.aten.view %2334, %2355 : !torch.vtensor<[1,?,4096],bf16>, !torch.list<int> -> !torch.vtensor<[1,?,32,128],bf16>
torch.bind_symbolic_shape %2356, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16>
%int1_2212 = torch.constant.int 1
%int8_2213 = torch.constant.int 8
%int128_2214 = torch.constant.int 128
%2357 = torch.prim.ListConstruct %int1_2212, %566, %int8_2213, %int128_2214 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2358 = torch.aten.view %2344, %2357 : !torch.vtensor<[1,?,1024],bf16>, !torch.list<int> -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %2358, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
%int1_2215 = torch.constant.int 1
%int8_2216 = torch.constant.int 8
%int128_2217 = torch.constant.int 128
%2359 = torch.prim.ListConstruct %int1_2215, %566, %int8_2216, %int128_2217 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2360 = torch.aten.view %2354, %2359 : !torch.vtensor<[1,?,1024],bf16>, !torch.list<int> -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %2360, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
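// Build the rotary-embedding angle table: positions arange(131072) times inverse frequencies
// 500000^(-2*floor(i/2)/128), giving a [131072,128] table (rope theta 500000, 131072 max positions).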
%int131072_2218 = torch.constant.int 131072
%none_2219 = torch.constant.none
%none_2220 = torch.constant.none
%cpu_2221 = torch.constant.device "cpu"
%false_2222 = torch.constant.bool false
%2361 = torch.aten.arange %int131072_2218, %none_2219, %none_2220, %cpu_2221, %false_2222 : !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[131072],si64>
%int0_2223 = torch.constant.int 0
%int128_2224 = torch.constant.int 128
%none_2225 = torch.constant.none
%none_2226 = torch.constant.none
%cpu_2227 = torch.constant.device "cpu"
%false_2228 = torch.constant.bool false
%2362 = torch.aten.arange.start %int0_2223, %int128_2224, %none_2225, %none_2226, %cpu_2227, %false_2228 : !torch.int, !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[128],si64>
%int2_2229 = torch.constant.int 2
%2363 = torch.aten.floor_divide.Scalar %2362, %int2_2229 : !torch.vtensor<[128],si64>, !torch.int -> !torch.vtensor<[128],si64>
%int6_2230 = torch.constant.int 6
%2364 = torch.prims.convert_element_type %2363, %int6_2230 : !torch.vtensor<[128],si64>, !torch.int -> !torch.vtensor<[128],f32>
%int128_2231 = torch.constant.int 128
%2365 = torch.aten.div.Scalar %2364, %int128_2231 : !torch.vtensor<[128],f32>, !torch.int -> !torch.vtensor<[128],f32>
%float2.000000e00_2232 = torch.constant.float 2.000000e+00
%2366 = torch.aten.mul.Scalar %2365, %float2.000000e00_2232 : !torch.vtensor<[128],f32>, !torch.float -> !torch.vtensor<[128],f32>
%float5.000000e05_2233 = torch.constant.float 5.000000e+05
%2367 = torch.aten.pow.Scalar %float5.000000e05_2233, %2366 : !torch.float, !torch.vtensor<[128],f32> -> !torch.vtensor<[128],f32>
%2368 = torch.aten.reciprocal %2367 : !torch.vtensor<[128],f32> -> !torch.vtensor<[128],f32>
%float1.000000e00_2234 = torch.constant.float 1.000000e+00
%2369 = torch.aten.mul.Scalar %2368, %float1.000000e00_2234 : !torch.vtensor<[128],f32>, !torch.float -> !torch.vtensor<[128],f32>
%int131072_2235 = torch.constant.int 131072
%int1_2236 = torch.constant.int 1
%2370 = torch.prim.ListConstruct %int131072_2235, %int1_2236 : (!torch.int, !torch.int) -> !torch.list<int>
%2371 = torch.aten.view %2361, %2370 : !torch.vtensor<[131072],si64>, !torch.list<int> -> !torch.vtensor<[131072,1],si64>
%2372 = torch.aten.mul.Tensor %2371, %2369 : !torch.vtensor<[131072,1],si64>, !torch.vtensor<[128],f32> -> !torch.vtensor<[131072,128],f32>
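// Slice the angle table down to the current sequence length and shape it to [1, seq, 128].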
%int1_2237 = torch.constant.int 1
%2373 = torch.aten.size.int %2333, %int1_2237 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.int -> !torch.int
%int0_2238 = torch.constant.int 0
%2374 = torch.aten.add.int %int0_2238, %2373 : !torch.int, !torch.int -> !torch.int
%int0_2239 = torch.constant.int 0
%int0_2240 = torch.constant.int 0
%int1_2241 = torch.constant.int 1
%2375 = torch.aten.slice.Tensor %2372, %int0_2239, %int0_2240, %2374, %int1_2241 : !torch.vtensor<[131072,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],f32>
torch.bind_symbolic_shape %2375, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],f32>
%int1_2242 = torch.constant.int 1
%int0_2243 = torch.constant.int 0
%int9223372036854775807_2244 = torch.constant.int 9223372036854775807
%int1_2245 = torch.constant.int 1
%2376 = torch.aten.slice.Tensor %2375, %int1_2242, %int0_2243, %int9223372036854775807_2244, %int1_2245 : !torch.vtensor<[?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],f32>
torch.bind_symbolic_shape %2376, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],f32>
%int1_2246 = torch.constant.int 1
%int0_2247 = torch.constant.int 0
%int9223372036854775807_2248 = torch.constant.int 9223372036854775807
%int1_2249 = torch.constant.int 1
%2377 = torch.aten.slice.Tensor %2376, %int1_2246, %int0_2247, %int9223372036854775807_2248, %int1_2249 : !torch.vtensor<[?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],f32>
torch.bind_symbolic_shape %2377, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],f32>
%int0_2250 = torch.constant.int 0
%2378 = torch.aten.unsqueeze %2377, %int0_2250 : !torch.vtensor<[?,128],f32>, !torch.int -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %2378, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int1_2251 = torch.constant.int 1
%int0_2252 = torch.constant.int 0
%int9223372036854775807_2253 = torch.constant.int 9223372036854775807
%int1_2254 = torch.constant.int 1
%2379 = torch.aten.slice.Tensor %2378, %int1_2251, %int0_2252, %int9223372036854775807_2253, %int1_2254 : !torch.vtensor<[1,?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %2379, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int2_2255 = torch.constant.int 2
%int0_2256 = torch.constant.int 0
%int9223372036854775807_2257 = torch.constant.int 9223372036854775807
%int1_2258 = torch.constant.int 1
%2380 = torch.aten.slice.Tensor %2379, %int2_2255, %int0_2256, %int9223372036854775807_2257, %int1_2258 : !torch.vtensor<[1,?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %2380, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int1_2259 = torch.constant.int 1
%int1_2260 = torch.constant.int 1
%int1_2261 = torch.constant.int 1
%2381 = torch.prim.ListConstruct %int1_2259, %int1_2260, %int1_2261 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2382 = torch.aten.repeat %2380, %2381 : !torch.vtensor<[1,?,128],f32>, !torch.list<int> -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %2382, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
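// Apply rotary position embedding to the query heads in f32 via the sharktank rotary kernel,
// then cast back to bf16.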
%int6_2262 = torch.constant.int 6
%2383 = torch.prims.convert_element_type %2356, %int6_2262 : !torch.vtensor<[1,?,32,128],bf16>, !torch.int -> !torch.vtensor<[1,?,32,128],f32>
torch.bind_symbolic_shape %2383, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32>
%2384 = torch_c.to_builtin_tensor %2383 : !torch.vtensor<[1,?,32,128],f32> -> tensor<1x?x32x128xf32>
%2385 = torch_c.to_builtin_tensor %2382 : !torch.vtensor<[1,?,128],f32> -> tensor<1x?x128xf32>
%2386 = util.call @sharktank_rotary_embedding_1_D_32_128_f32(%2384, %2385) : (tensor<1x?x32x128xf32>, tensor<1x?x128xf32>) -> tensor<1x?x32x128xf32>
%2387 = torch_c.from_builtin_tensor %2386 : tensor<1x?x32x128xf32> -> !torch.vtensor<[1,?,32,128],f32>
torch.bind_symbolic_shape %2387, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32>
%int15_2263 = torch.constant.int 15
%2388 = torch.prims.convert_element_type %2387, %int15_2263 : !torch.vtensor<[1,?,32,128],f32>, !torch.int -> !torch.vtensor<[1,?,32,128],bf16>
torch.bind_symbolic_shape %2388, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16>
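// The angle-table construction and slicing are repeated for the key heads (recomputed rather than
// reused), followed by the 8-head rotary kernel.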
%int131072_2264 = torch.constant.int 131072
%none_2265 = torch.constant.none
%none_2266 = torch.constant.none
%cpu_2267 = torch.constant.device "cpu"
%false_2268 = torch.constant.bool false
%2389 = torch.aten.arange %int131072_2264, %none_2265, %none_2266, %cpu_2267, %false_2268 : !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[131072],si64>
%int0_2269 = torch.constant.int 0
%int128_2270 = torch.constant.int 128
%none_2271 = torch.constant.none
%none_2272 = torch.constant.none
%cpu_2273 = torch.constant.device "cpu"
%false_2274 = torch.constant.bool false
%2390 = torch.aten.arange.start %int0_2269, %int128_2270, %none_2271, %none_2272, %cpu_2273, %false_2274 : !torch.int, !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[128],si64>
%int2_2275 = torch.constant.int 2
%2391 = torch.aten.floor_divide.Scalar %2390, %int2_2275 : !torch.vtensor<[128],si64>, !torch.int -> !torch.vtensor<[128],si64>
%int6_2276 = torch.constant.int 6
%2392 = torch.prims.convert_element_type %2391, %int6_2276 : !torch.vtensor<[128],si64>, !torch.int -> !torch.vtensor<[128],f32>
%int128_2277 = torch.constant.int 128
%2393 = torch.aten.div.Scalar %2392, %int128_2277 : !torch.vtensor<[128],f32>, !torch.int -> !torch.vtensor<[128],f32>
%float2.000000e00_2278 = torch.constant.float 2.000000e+00
%2394 = torch.aten.mul.Scalar %2393, %float2.000000e00_2278 : !torch.vtensor<[128],f32>, !torch.float -> !torch.vtensor<[128],f32>
%float5.000000e05_2279 = torch.constant.float 5.000000e+05
%2395 = torch.aten.pow.Scalar %float5.000000e05_2279, %2394 : !torch.float, !torch.vtensor<[128],f32> -> !torch.vtensor<[128],f32>
%2396 = torch.aten.reciprocal %2395 : !torch.vtensor<[128],f32> -> !torch.vtensor<[128],f32>
%float1.000000e00_2280 = torch.constant.float 1.000000e+00
%2397 = torch.aten.mul.Scalar %2396, %float1.000000e00_2280 : !torch.vtensor<[128],f32>, !torch.float -> !torch.vtensor<[128],f32>
%int131072_2281 = torch.constant.int 131072
%int1_2282 = torch.constant.int 1
%2398 = torch.prim.ListConstruct %int131072_2281, %int1_2282 : (!torch.int, !torch.int) -> !torch.list<int>
%2399 = torch.aten.view %2389, %2398 : !torch.vtensor<[131072],si64>, !torch.list<int> -> !torch.vtensor<[131072,1],si64>
%2400 = torch.aten.mul.Tensor %2399, %2397 : !torch.vtensor<[131072,1],si64>, !torch.vtensor<[128],f32> -> !torch.vtensor<[131072,128],f32>
%int1_2283 = torch.constant.int 1
%2401 = torch.aten.size.int %2343, %int1_2283 : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>, !torch.int -> !torch.int
%int0_2284 = torch.constant.int 0
%2402 = torch.aten.add.int %int0_2284, %2401 : !torch.int, !torch.int -> !torch.int
%int0_2285 = torch.constant.int 0
%int0_2286 = torch.constant.int 0
%int1_2287 = torch.constant.int 1
%2403 = torch.aten.slice.Tensor %2400, %int0_2285, %int0_2286, %2402, %int1_2287 : !torch.vtensor<[131072,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],f32>
torch.bind_symbolic_shape %2403, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],f32>
%int1_2288 = torch.constant.int 1
%int0_2289 = torch.constant.int 0
%int9223372036854775807_2290 = torch.constant.int 9223372036854775807
%int1_2291 = torch.constant.int 1
%2404 = torch.aten.slice.Tensor %2403, %int1_2288, %int0_2289, %int9223372036854775807_2290, %int1_2291 : !torch.vtensor<[?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],f32>
torch.bind_symbolic_shape %2404, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],f32>
%int1_2292 = torch.constant.int 1
%int0_2293 = torch.constant.int 0
%int9223372036854775807_2294 = torch.constant.int 9223372036854775807
%int1_2295 = torch.constant.int 1
%2405 = torch.aten.slice.Tensor %2404, %int1_2292, %int0_2293, %int9223372036854775807_2294, %int1_2295 : !torch.vtensor<[?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],f32>
torch.bind_symbolic_shape %2405, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],f32>
%int0_2296 = torch.constant.int 0
%2406 = torch.aten.unsqueeze %2405, %int0_2296 : !torch.vtensor<[?,128],f32>, !torch.int -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %2406, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int1_2297 = torch.constant.int 1
%int0_2298 = torch.constant.int 0
%int9223372036854775807_2299 = torch.constant.int 9223372036854775807
%int1_2300 = torch.constant.int 1
%2407 = torch.aten.slice.Tensor %2406, %int1_2297, %int0_2298, %int9223372036854775807_2299, %int1_2300 : !torch.vtensor<[1,?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %2407, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int2_2301 = torch.constant.int 2
%int0_2302 = torch.constant.int 0
%int9223372036854775807_2303 = torch.constant.int 9223372036854775807
%int1_2304 = torch.constant.int 1
%2408 = torch.aten.slice.Tensor %2407, %int2_2301, %int0_2302, %int9223372036854775807_2303, %int1_2304 : !torch.vtensor<[1,?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %2408, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int1_2305 = torch.constant.int 1
%int1_2306 = torch.constant.int 1
%int1_2307 = torch.constant.int 1
%2409 = torch.prim.ListConstruct %int1_2305, %int1_2306, %int1_2307 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2410 = torch.aten.repeat %2408, %2409 : !torch.vtensor<[1,?,128],f32>, !torch.list<int> -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %2410, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int6_2308 = torch.constant.int 6
%2411 = torch.prims.convert_element_type %2358, %int6_2308 : !torch.vtensor<[1,?,8,128],bf16>, !torch.int -> !torch.vtensor<[1,?,8,128],f32>
torch.bind_symbolic_shape %2411, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],f32>
%2412 = torch_c.to_builtin_tensor %2411 : !torch.vtensor<[1,?,8,128],f32> -> tensor<1x?x8x128xf32>
%2413 = torch_c.to_builtin_tensor %2410 : !torch.vtensor<[1,?,128],f32> -> tensor<1x?x128xf32>
%2414 = util.call @sharktank_rotary_embedding_1_D_8_128_f32(%2412, %2413) : (tensor<1x?x8x128xf32>, tensor<1x?x128xf32>) -> tensor<1x?x8x128xf32>
%2415 = torch_c.from_builtin_tensor %2414 : tensor<1x?x8x128xf32> -> !torch.vtensor<[1,?,8,128],f32>
torch.bind_symbolic_shape %2415, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],f32>
%int15_2309 = torch.constant.int 15
%2416 = torch.prims.convert_element_type %2415, %int15_2309 : !torch.vtensor<[1,?,8,128],f32>, !torch.int -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %2416, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
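// Quantize K and V for the KV cache: divide by the cache quantizer scale (%144, likely
// kv_cache.quantizer:rscale), clamp to +/-240 (the f8E4M3FNUZ finite range), narrow to f8E4M3FNUZ.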
%2417 = torch.aten.div.Tensor %2416, %144 : !torch.vtensor<[1,?,8,128],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %2417, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
%float-2.400000e02_2310 = torch.constant.float -2.400000e+02
%float2.400000e02_2311 = torch.constant.float 2.400000e+02
%2418 = torch.aten.clamp %2417, %float-2.400000e02_2310, %float2.400000e02_2311 : !torch.vtensor<[1,?,8,128],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %2418, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
%int26_2312 = torch.constant.int 26
%2419 = torch.prims.convert_element_type %2418, %int26_2312 : !torch.vtensor<[1,?,8,128],bf16>, !torch.int -> !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %2419, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>
%2420 = torch.aten.div.Tensor %2360, %144 : !torch.vtensor<[1,?,8,128],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %2420, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
%float-2.400000e02_2313 = torch.constant.float -2.400000e+02
%float2.400000e02_2314 = torch.constant.float 2.400000e+02
%2421 = torch.aten.clamp %2420, %float-2.400000e02_2313, %float2.400000e02_2314 : !torch.vtensor<[1,?,8,128],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %2421, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
%int26_2315 = torch.constant.int 26
%2422 = torch.prims.convert_element_type %2421, %int26_2315 : !torch.vtensor<[1,?,8,128],bf16>, !torch.int -> !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %2422, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>
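// Compute flat cache-slot indices from the page table %arg2 (stride 64 = 32 transformer blocks x 2
// K/V entries; the +16 offset likely selects this block's K partition), then scatter K page-by-page
// into the paged cache (%2239) via view + index_put.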
%int64_2316 = torch.constant.int 64
%2423 = torch.aten.mul.Scalar %arg2, %int64_2316 : !torch.vtensor<[1,?],si64>, !torch.int -> !torch.vtensor<[1,?],si64>
torch.bind_symbolic_shape %2423, [%548], affine_map<()[s0] -> (1, s0)> : !torch.vtensor<[1,?],si64>
%int16 = torch.constant.int 16
%int1_2317 = torch.constant.int 1
%2424 = torch.aten.add.Scalar %2423, %int16, %int1_2317 : !torch.vtensor<[1,?],si64>, !torch.int, !torch.int -> !torch.vtensor<[1,?],si64>
torch.bind_symbolic_shape %2424, [%548], affine_map<()[s0] -> (1, s0)> : !torch.vtensor<[1,?],si64>
%int1_2318 = torch.constant.int 1
%int32_2319 = torch.constant.int 32
%int8_2320 = torch.constant.int 8
%int128_2321 = torch.constant.int 128
%2425 = torch.prim.ListConstruct %int1_2318, %670, %int32_2319, %int8_2320, %int128_2321 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2426 = torch.aten.view %2419, %2425 : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %2426, [%548], affine_map<()[s0] -> (1, s0, 32, 8, 128)> : !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>
%int32_2322 = torch.constant.int 32
%int8_2323 = torch.constant.int 8
%int128_2324 = torch.constant.int 128
%2427 = torch.prim.ListConstruct %670, %int32_2322, %int8_2323, %int128_2324 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2428 = torch.aten.view %2426, %2427 : !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %2428, [%548], affine_map<()[s0] -> (s0, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
%2429 = torch.prim.ListConstruct %670 : (!torch.int) -> !torch.list<int>
%2430 = torch.aten.view %2424, %2429 : !torch.vtensor<[1,?],si64>, !torch.list<int> -> !torch.vtensor<[?],si64>
torch.bind_symbolic_shape %2430, [%548], affine_map<()[s0] -> (s0)> : !torch.vtensor<[?],si64>
%int32_2325 = torch.constant.int 32
%int2_2326 = torch.constant.int 2
%int32_2327 = torch.constant.int 32
%int8_2328 = torch.constant.int 8
%int128_2329 = torch.constant.int 128
%2431 = torch.prim.ListConstruct %661, %int32_2325, %int2_2326, %int32_2327, %int8_2328, %int128_2329 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2432 = torch.aten.view %2239, %2431 : !torch.vtensor<[?,2097152],f16>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f16>
torch.bind_symbolic_shape %2432, [%549], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f16>
%int32_2330 = torch.constant.int 32
%2433 = torch.aten.mul.int %661, %int32_2330 : !torch.int, !torch.int -> !torch.int
%int2_2331 = torch.constant.int 2
%2434 = torch.aten.mul.int %2433, %int2_2331 : !torch.int, !torch.int -> !torch.int
%int32_2332 = torch.constant.int 32
%int8_2333 = torch.constant.int 8
%int128_2334 = torch.constant.int 128
%2435 = torch.prim.ListConstruct %2434, %int32_2332, %int8_2333, %int128_2334 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2436 = torch.aten.view %2432, %2435 : !torch.vtensor<[?,32,2,32,8,128],f16>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f16>
torch.bind_symbolic_shape %2436, [%549], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f16>
%2437 = torch.prim.ListConstruct %2430 : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>>
%false_2335 = torch.constant.bool false
%2438 = torch.aten.index_put %2436, %2437, %2428, %false_2335 : !torch.vtensor<[?,32,8,128],f16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.bool -> !torch.vtensor<[?,32,8,128],f16>
torch.bind_symbolic_shape %2438, [%549], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f16>
%int32_2336 = torch.constant.int 32
%int2_2337 = torch.constant.int 2
%int32_2338 = torch.constant.int 32
%int8_2339 = torch.constant.int 8
%int128_2340 = torch.constant.int 128
%2439 = torch.prim.ListConstruct %661, %int32_2336, %int2_2337, %int32_2338, %int8_2339, %int128_2340 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2440 = torch.aten.view %2438, %2439 : !torch.vtensor<[?,32,8,128],f16>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f16>
torch.bind_symbolic_shape %2440, [%549], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f16>
%int2097152_2341 = torch.constant.int 2097152
%2441 = torch.prim.ListConstruct %661, %int2097152_2341 : (!torch.int, !torch.int) -> !torch.list<int>
%2442 = torch.aten.view %2440, %2441 : !torch.vtensor<[?,32,2,32,8,128],f16>, !torch.list<int> -> !torch.vtensor<[?,2097152],f16>
torch.bind_symbolic_shape %2442, [%549], affine_map<()[s0] -> (s0, 2097152)> : !torch.vtensor<[?,2097152],f16>
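// Re-expand the flattened cache and repeat the scatter for V at the adjacent slot (index + 1),
// then flatten back to the [?, 2097152] paged-cache layout.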
%int32_2342 = torch.constant.int 32
%int2_2343 = torch.constant.int 2
%int32_2344 = torch.constant.int 32
%int8_2345 = torch.constant.int 8
%int128_2346 = torch.constant.int 128
%2443 = torch.prim.ListConstruct %661, %int32_2342, %int2_2343, %int32_2344, %int8_2345, %int128_2346 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2444 = torch.aten.view %2442, %2443 : !torch.vtensor<[?,2097152],f16>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f16>
torch.bind_symbolic_shape %2444, [%549], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f16>
%int32_2347 = torch.constant.int 32
%int8_2348 = torch.constant.int 8
%int128_2349 = torch.constant.int 128
%2445 = torch.prim.ListConstruct %2434, %int32_2347, %int8_2348, %int128_2349 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2446 = torch.aten.view %2444, %2445 : !torch.vtensor<[?,32,2,32,8,128],f16>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f16>
torch.bind_symbolic_shape %2446, [%549], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f16>
%int1_2350 = torch.constant.int 1
%int32_2351 = torch.constant.int 32
%int8_2352 = torch.constant.int 8
%int128_2353 = torch.constant.int 128
%2447 = torch.prim.ListConstruct %int1_2350, %670, %int32_2351, %int8_2352, %int128_2353 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2448 = torch.aten.view %2422, %2447 : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %2448, [%548], affine_map<()[s0] -> (1, s0, 32, 8, 128)> : !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>
%int32_2354 = torch.constant.int 32
%int8_2355 = torch.constant.int 8
%int128_2356 = torch.constant.int 128
%2449 = torch.prim.ListConstruct %670, %int32_2354, %int8_2355, %int128_2356 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2450 = torch.aten.view %2448, %2449 : !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %2450, [%548], affine_map<()[s0] -> (s0, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
%int1_2357 = torch.constant.int 1
%int1_2358 = torch.constant.int 1
%2451 = torch.aten.add.Scalar %2424, %int1_2357, %int1_2358 : !torch.vtensor<[1,?],si64>, !torch.int, !torch.int -> !torch.vtensor<[1,?],si64>
torch.bind_symbolic_shape %2451, [%548], affine_map<()[s0] -> (1, s0)> : !torch.vtensor<[1,?],si64>
%2452 = torch.prim.ListConstruct %670 : (!torch.int) -> !torch.list<int>
%2453 = torch.aten.view %2451, %2452 : !torch.vtensor<[1,?],si64>, !torch.list<int> -> !torch.vtensor<[?],si64>
torch.bind_symbolic_shape %2453, [%548], affine_map<()[s0] -> (s0)> : !torch.vtensor<[?],si64>
%2454 = torch.prim.ListConstruct %2453 : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>>
%false_2359 = torch.constant.bool false
%2455 = torch.aten.index_put %2446, %2454, %2450, %false_2359 : !torch.vtensor<[?,32,8,128],f16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.bool -> !torch.vtensor<[?,32,8,128],f16>
torch.bind_symbolic_shape %2455, [%549], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f16>
%int32_2360 = torch.constant.int 32
%int2_2361 = torch.constant.int 2
%int32_2362 = torch.constant.int 32
%int8_2363 = torch.constant.int 8
%int128_2364 = torch.constant.int 128
%2456 = torch.prim.ListConstruct %661, %int32_2360, %int2_2361, %int32_2362, %int8_2363, %int128_2364 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2457 = torch.aten.view %2455, %2456 : !torch.vtensor<[?,32,8,128],f16>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f16>
torch.bind_symbolic_shape %2457, [%549], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f16>
%int2097152_2365 = torch.constant.int 2097152
%2458 = torch.prim.ListConstruct %661, %int2097152_2365 : (!torch.int, !torch.int) -> !torch.list<int>
%2459 = torch.aten.view %2457, %2458 : !torch.vtensor<[?,32,2,32,8,128],f16>, !torch.list<int> -> !torch.vtensor<[?,2097152],f16>
torch.bind_symbolic_shape %2459, [%549], affine_map<()[s0] -> (s0, 2097152)> : !torch.vtensor<[?,2097152],f16>
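// Grouped-query attention: broadcast each of the 8 KV heads across 4 query heads
// (unsqueeze + expand + clone), so K and V become [1, seq, 32, 128].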
%int-2_2366 = torch.constant.int -2
%2460 = torch.aten.unsqueeze %2419, %int-2_2366 : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %2460, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 1, 128)> : !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>
%int1_2367 = torch.constant.int 1
%int8_2368 = torch.constant.int 8
%int4_2369 = torch.constant.int 4
%int128_2370 = torch.constant.int 128
%2461 = torch.prim.ListConstruct %int1_2367, %2401, %int8_2368, %int4_2369, %int128_2370 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%false_2371 = torch.constant.bool false
%2462 = torch.aten.expand %2460, %2461, %false_2371 : !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %2462, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 4, 128)> : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
%int0_2372 = torch.constant.int 0
%2463 = torch.aten.clone %2462, %int0_2372 : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %2463, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 4, 128)> : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
%int1_2373 = torch.constant.int 1
%int32_2374 = torch.constant.int 32
%int128_2375 = torch.constant.int 128
%2464 = torch.prim.ListConstruct %int1_2373, %2401, %int32_2374, %int128_2375 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2465 = torch.aten._unsafe_view %2463, %2464 : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %2465, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>
%int-2_2376 = torch.constant.int -2
%2466 = torch.aten.unsqueeze %2422, %int-2_2376 : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %2466, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 1, 128)> : !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>
%int1_2377 = torch.constant.int 1
%2467 = torch.aten.size.int %2353, %int1_2377 : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>, !torch.int -> !torch.int
%int1_2378 = torch.constant.int 1
%int8_2379 = torch.constant.int 8
%int4_2380 = torch.constant.int 4
%int128_2381 = torch.constant.int 128
%2468 = torch.prim.ListConstruct %int1_2378, %2467, %int8_2379, %int4_2380, %int128_2381 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%false_2382 = torch.constant.bool false
%2469 = torch.aten.expand %2466, %2468, %false_2382 : !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %2469, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 4, 128)> : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
%int0_2383 = torch.constant.int 0
%2470 = torch.aten.clone %2469, %int0_2383 : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %2470, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 4, 128)> : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
%int1_2384 = torch.constant.int 1
%int32_2385 = torch.constant.int 32
%int128_2386 = torch.constant.int 128
%2471 = torch.prim.ListConstruct %int1_2384, %2467, %int32_2385, %int128_2386 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2472 = torch.aten._unsafe_view %2470, %2471 : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %2472, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>
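// Dequantize the expanded K and V: widen to f32, multiply by the cache scale (%144), cast to bf16.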
%int6_2387 = torch.constant.int 6
%2473 = torch.prims.convert_element_type %2465, %int6_2387 : !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,32,128],f32>
torch.bind_symbolic_shape %2473, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32>
%2474 = torch.aten.mul.Tensor %2473, %144 : !torch.vtensor<[1,?,32,128],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,32,128],f32>
torch.bind_symbolic_shape %2474, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32>
%int15_2388 = torch.constant.int 15
%2475 = torch.prims.convert_element_type %2474, %int15_2388 : !torch.vtensor<[1,?,32,128],f32>, !torch.int -> !torch.vtensor<[1,?,32,128],bf16>
torch.bind_symbolic_shape %2475, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16>
%int6_2389 = torch.constant.int 6
%2476 = torch.prims.convert_element_type %2472, %int6_2389 : !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,32,128],f32>
torch.bind_symbolic_shape %2476, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32>
%2477 = torch.aten.mul.Tensor %2476, %144 : !torch.vtensor<[1,?,32,128],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,32,128],f32>
torch.bind_symbolic_shape %2477, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32>
%int15_2390 = torch.constant.int 15
%2478 = torch.prims.convert_element_type %2477, %int15_2390 : !torch.vtensor<[1,?,32,128],f32>, !torch.int -> !torch.vtensor<[1,?,32,128],bf16>
torch.bind_symbolic_shape %2478, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16>
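// Transpose Q/K/V to [1, 32, seq, 128] and run causal flash attention
// (dropout 0.0, is_causal = true) in bf16.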
%int1_2391 = torch.constant.int 1
%int2_2392 = torch.constant.int 2
%2479 = torch.aten.transpose.int %2388, %int1_2391, %int2_2392 : !torch.vtensor<[1,?,32,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,32,?,128],bf16>
torch.bind_symbolic_shape %2479, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16>
%int1_2393 = torch.constant.int 1
%int2_2394 = torch.constant.int 2
%2480 = torch.aten.transpose.int %2475, %int1_2393, %int2_2394 : !torch.vtensor<[1,?,32,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,32,?,128],bf16>
torch.bind_symbolic_shape %2480, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16>
%int1_2395 = torch.constant.int 1
%int2_2396 = torch.constant.int 2
%2481 = torch.aten.transpose.int %2478, %int1_2395, %int2_2396 : !torch.vtensor<[1,?,32,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,32,?,128],bf16>
torch.bind_symbolic_shape %2481, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16>
%float0.000000e00_2397 = torch.constant.float 0.000000e+00
%true_2398 = torch.constant.bool true
%none_2399 = torch.constant.none
%none_2400 = torch.constant.none
%2482:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%2479, %2480, %2481, %float0.000000e00_2397, %true_2398, %none_2399, %none_2400) : (!torch.vtensor<[1,32,?,128],bf16>, !torch.vtensor<[1,32,?,128],bf16>, !torch.vtensor<[1,32,?,128],bf16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[1,32,?,128],bf16>, !torch.vtensor<[1,32,?],f32>)
torch.bind_symbolic_shape %2482#0, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16>
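// Transpose the attention output back, flatten to [1, seq, 4096], re-quantize to f8E4M3FNUZ
// (scale %145), apply the output projection (%146, [4096,4096]), and add the residual (%2317).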
%int1_2401 = torch.constant.int 1
%int2_2402 = torch.constant.int 2
%2483 = torch.aten.transpose.int %2482#0, %int1_2401, %int2_2402 : !torch.vtensor<[1,32,?,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,?,32,128],bf16>
torch.bind_symbolic_shape %2483, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16>
%int1_2403 = torch.constant.int 1
%int4096_2404 = torch.constant.int 4096
%2484 = torch.prim.ListConstruct %int1_2403, %2373, %int4096_2404 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2485 = torch.aten.view %2483, %2484 : !torch.vtensor<[1,?,32,128],bf16>, !torch.list<int> -> !torch.vtensor<[1,?,4096],bf16>
torch.bind_symbolic_shape %2485, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16>
%2486 = torch.aten.div.Tensor %2485, %145 : !torch.vtensor<[1,?,4096],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],bf16>
torch.bind_symbolic_shape %2486, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16>
%float-2.400000e02_2405 = torch.constant.float -2.400000e+02
%float2.400000e02_2406 = torch.constant.float 2.400000e+02
%2487 = torch.aten.clamp %2486, %float-2.400000e02_2405, %float2.400000e02_2406 : !torch.vtensor<[1,?,4096],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],bf16>
torch.bind_symbolic_shape %2487, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16>
%int26_2407 = torch.constant.int 26
%2488 = torch.prims.convert_element_type %2487, %int26_2407 : !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2488, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int-2_2408 = torch.constant.int -2
%int-1_2409 = torch.constant.int -1
%2489 = torch.aten.transpose.int %146, %int-2_2408, %int-1_2409 : !torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%int4096_2410 = torch.constant.int 4096
%2490 = torch.prim.ListConstruct %2373, %int4096_2410 : (!torch.int, !torch.int) -> !torch.list<int>
%2491 = torch.aten.view %2488, %2490 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2491, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%2492 = torch.aten.mm %2491, %2489 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,4096],f8E4M3FNUZ> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2492, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%int1_2411 = torch.constant.int 1
%int4096_2412 = torch.constant.int 4096
%2493 = torch.prim.ListConstruct %int1_2411, %2373, %int4096_2412 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2494 = torch.aten.view %2492, %2493 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2494, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int15_2413 = torch.constant.int 15
%2495 = torch.prims.convert_element_type %2494, %int15_2413 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,4096],bf16>
torch.bind_symbolic_shape %2495, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16>
%int1_2414 = torch.constant.int 1
%2496 = torch.aten.add.Tensor %2317, %2495, %int1_2414 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2496, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
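// RMSNorm over the updated residual: mean of squares, +1e-05 epsilon, rsqrt, scale by the norm
// weight (%147, likely ffn_norm), then quantize for the feed-forward matmuls.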
%int2_2415 = torch.constant.int 2
%2497 = torch.aten.pow.Tensor_Scalar %2496, %int2_2415 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2497, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int-1_2416 = torch.constant.int -1
%2498 = torch.prim.ListConstruct %int-1_2416 : (!torch.int) -> !torch.list<int>
%true_2417 = torch.constant.bool true
%none_2418 = torch.constant.none
%2499 = torch.aten.mean.dim %2497, %2498, %true_2417, %none_2418 : !torch.vtensor<[1,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,?,1],f32>
torch.bind_symbolic_shape %2499, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32>
%float1.000000e-05_2419 = torch.constant.float 1.000000e-05
%int1_2420 = torch.constant.int 1
%2500 = torch.aten.add.Scalar %2499, %float1.000000e-05_2419, %int1_2420 : !torch.vtensor<[1,?,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,?,1],f32>
torch.bind_symbolic_shape %2500, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32>
%2501 = torch.aten.rsqrt %2500 : !torch.vtensor<[1,?,1],f32> -> !torch.vtensor<[1,?,1],f32>
torch.bind_symbolic_shape %2501, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32>
%2502 = torch.aten.mul.Tensor %2496, %2501 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[1,?,1],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2502, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%2503 = torch.aten.mul.Tensor %147, %2502 : !torch.vtensor<[4096],bf16>, !torch.vtensor<[1,?,4096],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2503, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%2504 = torch.aten.div.Tensor %2503, %148 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2504, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%float-2.400000e02_2421 = torch.constant.float -2.400000e+02
%float2.400000e02_2422 = torch.constant.float 2.400000e+02
%2505 = torch.aten.clamp %2504, %float-2.400000e02_2421, %float2.400000e02_2422 : !torch.vtensor<[1,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2505, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int26_2423 = torch.constant.int 26
%2506 = torch.prims.convert_element_type %2505, %int26_2423 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2506, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
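// SwiGLU feed-forward, gate branch: project [4096 -> 14336] with %149 and apply SiLU.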
%int-2_2424 = torch.constant.int -2
%int-1_2425 = torch.constant.int -1
%2507 = torch.aten.transpose.int %149, %int-2_2424, %int-1_2425 : !torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
%int4096_2426 = torch.constant.int 4096
%2508 = torch.prim.ListConstruct %566, %int4096_2426 : (!torch.int, !torch.int) -> !torch.list<int>
%2509 = torch.aten.view %2506, %2508 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2509, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%2510 = torch.aten.mm %2509, %2507 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,14336],f8E4M3FNUZ> -> !torch.vtensor<[?,14336],f8E4M3FNUZ>
torch.bind_symbolic_shape %2510, [%548], affine_map<()[s0] -> (s0 * 32, 14336)> : !torch.vtensor<[?,14336],f8E4M3FNUZ>
%int1_2427 = torch.constant.int 1
%int14336_2428 = torch.constant.int 14336
%2511 = torch.prim.ListConstruct %int1_2427, %566, %int14336_2428 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2512 = torch.aten.view %2510, %2511 : !torch.vtensor<[?,14336],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,14336],f8E4M3FNUZ>
torch.bind_symbolic_shape %2512, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>
%int15_2429 = torch.constant.int 15
%2513 = torch.prims.convert_element_type %2512, %int15_2429 : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,14336],bf16>
torch.bind_symbolic_shape %2513, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16>
%2514 = torch.aten.silu %2513 : !torch.vtensor<[1,?,14336],bf16> -> !torch.vtensor<[1,?,14336],bf16>
torch.bind_symbolic_shape %2514, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16>
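// Up branch: project [4096 -> 14336] with %151, then multiply elementwise with the SiLU-gated branch.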
%2515 = torch.aten.div.Tensor %2503, %150 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2515, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%float-2.400000e02_2430 = torch.constant.float -2.400000e+02
%float2.400000e02_2431 = torch.constant.float 2.400000e+02
%2516 = torch.aten.clamp %2515, %float-2.400000e02_2430, %float2.400000e02_2431 : !torch.vtensor<[1,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2516, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int26_2432 = torch.constant.int 26
%2517 = torch.prims.convert_element_type %2516, %int26_2432 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2517, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int-2_2433 = torch.constant.int -2
%int-1_2434 = torch.constant.int -1
%2518 = torch.aten.transpose.int %151, %int-2_2433, %int-1_2434 : !torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
%int4096_2435 = torch.constant.int 4096
%2519 = torch.prim.ListConstruct %566, %int4096_2435 : (!torch.int, !torch.int) -> !torch.list<int>
%2520 = torch.aten.view %2517, %2519 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2520, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%2521 = torch.aten.mm %2520, %2518 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,14336],f8E4M3FNUZ> -> !torch.vtensor<[?,14336],f8E4M3FNUZ>
torch.bind_symbolic_shape %2521, [%548], affine_map<()[s0] -> (s0 * 32, 14336)> : !torch.vtensor<[?,14336],f8E4M3FNUZ>
%int1_2436 = torch.constant.int 1
%int14336_2437 = torch.constant.int 14336
%2522 = torch.prim.ListConstruct %int1_2436, %566, %int14336_2437 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2523 = torch.aten.view %2521, %2522 : !torch.vtensor<[?,14336],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,14336],f8E4M3FNUZ>
torch.bind_symbolic_shape %2523, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>
%int15_2438 = torch.constant.int 15
%2524 = torch.prims.convert_element_type %2523, %int15_2438 : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,14336],bf16>
torch.bind_symbolic_shape %2524, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16>
%2525 = torch.aten.mul.Tensor %2514, %2524 : !torch.vtensor<[1,?,14336],bf16>, !torch.vtensor<[1,?,14336],bf16> -> !torch.vtensor<[1,?,14336],bf16>
torch.bind_symbolic_shape %2525, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16>
%2526 = torch.aten.div.Tensor %2525, %152 : !torch.vtensor<[1,?,14336],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,14336],bf16>
torch.bind_symbolic_shape %2526, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16>
%float-2.400000e02_2439 = torch.constant.float -2.400000e+02
%float2.400000e02_2440 = torch.constant.float 2.400000e+02
%2527 = torch.aten.clamp %2526, %float-2.400000e02_2439, %float2.400000e02_2440 : !torch.vtensor<[1,?,14336],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,14336],bf16>
torch.bind_symbolic_shape %2527, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16>
%int26_2441 = torch.constant.int 26
%2528 = torch.prims.convert_element_type %2527, %int26_2441 : !torch.vtensor<[1,?,14336],bf16>, !torch.int -> !torch.vtensor<[1,?,14336],f8E4M3FNUZ>
torch.bind_symbolic_shape %2528, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>
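// Down projection [14336 -> 4096] with %153, cast to bf16, and the second residual add,
// completing this transformer block.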
%int-2_2442 = torch.constant.int -2
%int-1_2443 = torch.constant.int -1
%2529 = torch.aten.transpose.int %153, %int-2_2442, %int-1_2443 : !torch.vtensor<[4096,14336],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%int1_2444 = torch.constant.int 1
%2530 = torch.aten.size.int %2512, %int1_2444 : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>, !torch.int -> !torch.int
%int14336_2445 = torch.constant.int 14336
%2531 = torch.prim.ListConstruct %2530, %int14336_2445 : (!torch.int, !torch.int) -> !torch.list<int>
%2532 = torch.aten.view %2528, %2531 : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,14336],f8E4M3FNUZ>
torch.bind_symbolic_shape %2532, [%548], affine_map<()[s0] -> (s0 * 32, 14336)> : !torch.vtensor<[?,14336],f8E4M3FNUZ>
%2533 = torch.aten.mm %2532, %2529 : !torch.vtensor<[?,14336],f8E4M3FNUZ>, !torch.vtensor<[14336,4096],f8E4M3FNUZ> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2533, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%int1_2446 = torch.constant.int 1
%int4096_2447 = torch.constant.int 4096
%2534 = torch.prim.ListConstruct %int1_2446, %2530, %int4096_2447 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2535 = torch.aten.view %2533, %2534 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2535, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int15_2448 = torch.constant.int 15
%2536 = torch.prims.convert_element_type %2535, %int15_2448 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,4096],bf16>
torch.bind_symbolic_shape %2536, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16>
%int1_2449 = torch.constant.int 1
%2537 = torch.aten.add.Tensor %2496, %2536, %int1_2449 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2537, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
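// Next transformer block: attention RMSNorm (weight %154) on the residual, then quantized
// Q (%156, [4096,4096]), K (%158, [1024,4096]) and V (%160, [1024,4096]) projections.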
%int2_2450 = torch.constant.int 2
%2538 = torch.aten.pow.Tensor_Scalar %2537, %int2_2450 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2538, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int-1_2451 = torch.constant.int -1
%2539 = torch.prim.ListConstruct %int-1_2451 : (!torch.int) -> !torch.list<int>
%true_2452 = torch.constant.bool true
%none_2453 = torch.constant.none
%2540 = torch.aten.mean.dim %2538, %2539, %true_2452, %none_2453 : !torch.vtensor<[1,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,?,1],f32>
torch.bind_symbolic_shape %2540, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32>
%float1.000000e-05_2454 = torch.constant.float 1.000000e-05
%int1_2455 = torch.constant.int 1
%2541 = torch.aten.add.Scalar %2540, %float1.000000e-05_2454, %int1_2455 : !torch.vtensor<[1,?,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,?,1],f32>
torch.bind_symbolic_shape %2541, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32>
%2542 = torch.aten.rsqrt %2541 : !torch.vtensor<[1,?,1],f32> -> !torch.vtensor<[1,?,1],f32>
torch.bind_symbolic_shape %2542, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32>
%2543 = torch.aten.mul.Tensor %2537, %2542 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[1,?,1],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2543, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%2544 = torch.aten.mul.Tensor %154, %2543 : !torch.vtensor<[4096],bf16>, !torch.vtensor<[1,?,4096],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2544, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%2545 = torch.aten.div.Tensor %2544, %155 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2545, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%float-2.400000e02_2456 = torch.constant.float -2.400000e+02
%float2.400000e02_2457 = torch.constant.float 2.400000e+02
%2546 = torch.aten.clamp %2545, %float-2.400000e02_2456, %float2.400000e02_2457 : !torch.vtensor<[1,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2546, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int26_2458 = torch.constant.int 26
%2547 = torch.prims.convert_element_type %2546, %int26_2458 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2547, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int-2_2459 = torch.constant.int -2
%int-1_2460 = torch.constant.int -1
%2548 = torch.aten.transpose.int %156, %int-2_2459, %int-1_2460 : !torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%int4096_2461 = torch.constant.int 4096
%2549 = torch.prim.ListConstruct %566, %int4096_2461 : (!torch.int, !torch.int) -> !torch.list<int>
%2550 = torch.aten.view %2547, %2549 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2550, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%2551 = torch.aten.mm %2550, %2548 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,4096],f8E4M3FNUZ> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2551, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%int1_2462 = torch.constant.int 1
%int4096_2463 = torch.constant.int 4096
%2552 = torch.prim.ListConstruct %int1_2462, %566, %int4096_2463 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2553 = torch.aten.view %2551, %2552 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2553, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int15_2464 = torch.constant.int 15
%2554 = torch.prims.convert_element_type %2553, %int15_2464 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,4096],bf16>
torch.bind_symbolic_shape %2554, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16>
%2555 = torch.aten.div.Tensor %2544, %157 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2555, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%float-2.400000e02_2465 = torch.constant.float -2.400000e+02
%float2.400000e02_2466 = torch.constant.float 2.400000e+02
%2556 = torch.aten.clamp %2555, %float-2.400000e02_2465, %float2.400000e02_2466 : !torch.vtensor<[1,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2556, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int26_2467 = torch.constant.int 26
%2557 = torch.prims.convert_element_type %2556, %int26_2467 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2557, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int-2_2468 = torch.constant.int -2
%int-1_2469 = torch.constant.int -1
%2558 = torch.aten.transpose.int %158, %int-2_2468, %int-1_2469 : !torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,1024],f8E4M3FNUZ>
%int4096_2470 = torch.constant.int 4096
%2559 = torch.prim.ListConstruct %566, %int4096_2470 : (!torch.int, !torch.int) -> !torch.list<int>
%2560 = torch.aten.view %2557, %2559 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2560, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%2561 = torch.aten.mm %2560, %2558 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,1024],f8E4M3FNUZ> -> !torch.vtensor<[?,1024],f8E4M3FNUZ>
torch.bind_symbolic_shape %2561, [%548], affine_map<()[s0] -> (s0 * 32, 1024)> : !torch.vtensor<[?,1024],f8E4M3FNUZ>
%int1_2471 = torch.constant.int 1
%int1024_2472 = torch.constant.int 1024
%2562 = torch.prim.ListConstruct %int1_2471, %566, %int1024_2472 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2563 = torch.aten.view %2561, %2562 : !torch.vtensor<[?,1024],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,1024],f8E4M3FNUZ>
torch.bind_symbolic_shape %2563, [%548], affine_map<()[s0] -> (1, s0 * 32, 1024)> : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>
%int15_2473 = torch.constant.int 15
%2564 = torch.prims.convert_element_type %2563, %int15_2473 : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,1024],bf16>
torch.bind_symbolic_shape %2564, [%548], affine_map<()[s0] -> (1, s0 * 32, 1024)> : !torch.vtensor<[1,?,1024],bf16>
%2565 = torch.aten.div.Tensor %2544, %159 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2565, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%float-2.400000e02_2474 = torch.constant.float -2.400000e+02
%float2.400000e02_2475 = torch.constant.float 2.400000e+02
%2566 = torch.aten.clamp %2565, %float-2.400000e02_2474, %float2.400000e02_2475 : !torch.vtensor<[1,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2566, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int26_2476 = torch.constant.int 26
%2567 = torch.prims.convert_element_type %2566, %int26_2476 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2567, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int-2_2477 = torch.constant.int -2
%int-1_2478 = torch.constant.int -1
%2568 = torch.aten.transpose.int %160, %int-2_2477, %int-1_2478 : !torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,1024],f8E4M3FNUZ>
%int4096_2479 = torch.constant.int 4096
%2569 = torch.prim.ListConstruct %566, %int4096_2479 : (!torch.int, !torch.int) -> !torch.list<int>
%2570 = torch.aten.view %2567, %2569 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2570, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%2571 = torch.aten.mm %2570, %2568 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,1024],f8E4M3FNUZ> -> !torch.vtensor<[?,1024],f8E4M3FNUZ>
torch.bind_symbolic_shape %2571, [%548], affine_map<()[s0] -> (s0 * 32, 1024)> : !torch.vtensor<[?,1024],f8E4M3FNUZ>
%int1_2480 = torch.constant.int 1
%int1024_2481 = torch.constant.int 1024
%2572 = torch.prim.ListConstruct %int1_2480, %566, %int1024_2481 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2573 = torch.aten.view %2571, %2572 : !torch.vtensor<[?,1024],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,1024],f8E4M3FNUZ>
torch.bind_symbolic_shape %2573, [%548], affine_map<()[s0] -> (1, s0 * 32, 1024)> : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>
%int15_2482 = torch.constant.int 15
%2574 = torch.prims.convert_element_type %2573, %int15_2482 : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,1024],bf16>
torch.bind_symbolic_shape %2574, [%548], affine_map<()[s0] -> (1, s0 * 32, 1024)> : !torch.vtensor<[1,?,1024],bf16>
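// Reshape the new Q/K/V into per-head layout as above; the rotary angle-table construction
// then repeats for this block.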
%int1_2483 = torch.constant.int 1
%int32_2484 = torch.constant.int 32
%int128_2485 = torch.constant.int 128
%2575 = torch.prim.ListConstruct %int1_2483, %566, %int32_2484, %int128_2485 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2576 = torch.aten.view %2554, %2575 : !torch.vtensor<[1,?,4096],bf16>, !torch.list<int> -> !torch.vtensor<[1,?,32,128],bf16>
torch.bind_symbolic_shape %2576, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16>
%int1_2486 = torch.constant.int 1
%int8_2487 = torch.constant.int 8
%int128_2488 = torch.constant.int 128
%2577 = torch.prim.ListConstruct %int1_2486, %566, %int8_2487, %int128_2488 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2578 = torch.aten.view %2564, %2577 : !torch.vtensor<[1,?,1024],bf16>, !torch.list<int> -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %2578, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
%int1_2489 = torch.constant.int 1
%int8_2490 = torch.constant.int 8
%int128_2491 = torch.constant.int 128
%2579 = torch.prim.ListConstruct %int1_2489, %566, %int8_2490, %int128_2491 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2580 = torch.aten.view %2574, %2579 : !torch.vtensor<[1,?,1024],bf16>, !torch.list<int> -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %2580, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
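    // Rotary embedding angle table for Q: inv_freq[i] = 1 / 500000^(2*(i/2)/128) over head_dim 128,
    // multiplied by positions 0..131071 to give a [131072,128] table.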
%int131072_2492 = torch.constant.int 131072
%none_2493 = torch.constant.none
%none_2494 = torch.constant.none
%cpu_2495 = torch.constant.device "cpu"
%false_2496 = torch.constant.bool false
%2581 = torch.aten.arange %int131072_2492, %none_2493, %none_2494, %cpu_2495, %false_2496 : !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[131072],si64>
%int0_2497 = torch.constant.int 0
%int128_2498 = torch.constant.int 128
%none_2499 = torch.constant.none
%none_2500 = torch.constant.none
%cpu_2501 = torch.constant.device "cpu"
%false_2502 = torch.constant.bool false
%2582 = torch.aten.arange.start %int0_2497, %int128_2498, %none_2499, %none_2500, %cpu_2501, %false_2502 : !torch.int, !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[128],si64>
%int2_2503 = torch.constant.int 2
%2583 = torch.aten.floor_divide.Scalar %2582, %int2_2503 : !torch.vtensor<[128],si64>, !torch.int -> !torch.vtensor<[128],si64>
%int6_2504 = torch.constant.int 6
%2584 = torch.prims.convert_element_type %2583, %int6_2504 : !torch.vtensor<[128],si64>, !torch.int -> !torch.vtensor<[128],f32>
%int128_2505 = torch.constant.int 128
%2585 = torch.aten.div.Scalar %2584, %int128_2505 : !torch.vtensor<[128],f32>, !torch.int -> !torch.vtensor<[128],f32>
%float2.000000e00_2506 = torch.constant.float 2.000000e+00
%2586 = torch.aten.mul.Scalar %2585, %float2.000000e00_2506 : !torch.vtensor<[128],f32>, !torch.float -> !torch.vtensor<[128],f32>
%float5.000000e05_2507 = torch.constant.float 5.000000e+05
%2587 = torch.aten.pow.Scalar %float5.000000e05_2507, %2586 : !torch.float, !torch.vtensor<[128],f32> -> !torch.vtensor<[128],f32>
%2588 = torch.aten.reciprocal %2587 : !torch.vtensor<[128],f32> -> !torch.vtensor<[128],f32>
%float1.000000e00_2508 = torch.constant.float 1.000000e+00
%2589 = torch.aten.mul.Scalar %2588, %float1.000000e00_2508 : !torch.vtensor<[128],f32>, !torch.float -> !torch.vtensor<[128],f32>
%int131072_2509 = torch.constant.int 131072
%int1_2510 = torch.constant.int 1
%2590 = torch.prim.ListConstruct %int131072_2509, %int1_2510 : (!torch.int, !torch.int) -> !torch.list<int>
%2591 = torch.aten.view %2581, %2590 : !torch.vtensor<[131072],si64>, !torch.list<int> -> !torch.vtensor<[131072,1],si64>
%2592 = torch.aten.mul.Tensor %2591, %2589 : !torch.vtensor<[131072,1],si64>, !torch.vtensor<[128],f32> -> !torch.vtensor<[131072,128],f32>
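    // Slice the angle table down to the current sequence length and broadcast it to [1,?,128]
    // for the rotary kernel.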
%int1_2511 = torch.constant.int 1
%2593 = torch.aten.size.int %2553, %int1_2511 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.int -> !torch.int
%int0_2512 = torch.constant.int 0
%2594 = torch.aten.add.int %int0_2512, %2593 : !torch.int, !torch.int -> !torch.int
%int0_2513 = torch.constant.int 0
%int0_2514 = torch.constant.int 0
%int1_2515 = torch.constant.int 1
%2595 = torch.aten.slice.Tensor %2592, %int0_2513, %int0_2514, %2594, %int1_2515 : !torch.vtensor<[131072,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],f32>
torch.bind_symbolic_shape %2595, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],f32>
%int1_2516 = torch.constant.int 1
%int0_2517 = torch.constant.int 0
%int9223372036854775807_2518 = torch.constant.int 9223372036854775807
%int1_2519 = torch.constant.int 1
%2596 = torch.aten.slice.Tensor %2595, %int1_2516, %int0_2517, %int9223372036854775807_2518, %int1_2519 : !torch.vtensor<[?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],f32>
torch.bind_symbolic_shape %2596, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],f32>
%int1_2520 = torch.constant.int 1
%int0_2521 = torch.constant.int 0
%int9223372036854775807_2522 = torch.constant.int 9223372036854775807
%int1_2523 = torch.constant.int 1
%2597 = torch.aten.slice.Tensor %2596, %int1_2520, %int0_2521, %int9223372036854775807_2522, %int1_2523 : !torch.vtensor<[?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],f32>
torch.bind_symbolic_shape %2597, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],f32>
%int0_2524 = torch.constant.int 0
%2598 = torch.aten.unsqueeze %2597, %int0_2524 : !torch.vtensor<[?,128],f32>, !torch.int -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %2598, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int1_2525 = torch.constant.int 1
%int0_2526 = torch.constant.int 0
%int9223372036854775807_2527 = torch.constant.int 9223372036854775807
%int1_2528 = torch.constant.int 1
%2599 = torch.aten.slice.Tensor %2598, %int1_2525, %int0_2526, %int9223372036854775807_2527, %int1_2528 : !torch.vtensor<[1,?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %2599, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int2_2529 = torch.constant.int 2
%int0_2530 = torch.constant.int 0
%int9223372036854775807_2531 = torch.constant.int 9223372036854775807
%int1_2532 = torch.constant.int 1
%2600 = torch.aten.slice.Tensor %2599, %int2_2529, %int0_2530, %int9223372036854775807_2531, %int1_2532 : !torch.vtensor<[1,?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %2600, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int1_2533 = torch.constant.int 1
%int1_2534 = torch.constant.int 1
%int1_2535 = torch.constant.int 1
%2601 = torch.prim.ListConstruct %int1_2533, %int1_2534, %int1_2535 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2602 = torch.aten.repeat %2600, %2601 : !torch.vtensor<[1,?,128],f32>, !torch.list<int> -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %2602, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
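    // Apply rotary position embedding to the query heads in f32 via the
    // sharktank_rotary_embedding kernel, then cast back to bf16.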
%int6_2536 = torch.constant.int 6
%2603 = torch.prims.convert_element_type %2576, %int6_2536 : !torch.vtensor<[1,?,32,128],bf16>, !torch.int -> !torch.vtensor<[1,?,32,128],f32>
torch.bind_symbolic_shape %2603, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32>
%2604 = torch_c.to_builtin_tensor %2603 : !torch.vtensor<[1,?,32,128],f32> -> tensor<1x?x32x128xf32>
%2605 = torch_c.to_builtin_tensor %2602 : !torch.vtensor<[1,?,128],f32> -> tensor<1x?x128xf32>
%2606 = util.call @sharktank_rotary_embedding_1_D_32_128_f32(%2604, %2605) : (tensor<1x?x32x128xf32>, tensor<1x?x128xf32>) -> tensor<1x?x32x128xf32>
%2607 = torch_c.from_builtin_tensor %2606 : tensor<1x?x32x128xf32> -> !torch.vtensor<[1,?,32,128],f32>
torch.bind_symbolic_shape %2607, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32>
%int15_2537 = torch.constant.int 15
%2608 = torch.prims.convert_element_type %2607, %int15_2537 : !torch.vtensor<[1,?,32,128],f32>, !torch.int -> !torch.vtensor<[1,?,32,128],bf16>
torch.bind_symbolic_shape %2608, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16>
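    // Rebuild the identical angle table and apply the same rotary embedding to the
    // key heads (8 x 128) via the 8-head variant of the kernel.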
%int131072_2538 = torch.constant.int 131072
%none_2539 = torch.constant.none
%none_2540 = torch.constant.none
%cpu_2541 = torch.constant.device "cpu"
%false_2542 = torch.constant.bool false
%2609 = torch.aten.arange %int131072_2538, %none_2539, %none_2540, %cpu_2541, %false_2542 : !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[131072],si64>
%int0_2543 = torch.constant.int 0
%int128_2544 = torch.constant.int 128
%none_2545 = torch.constant.none
%none_2546 = torch.constant.none
%cpu_2547 = torch.constant.device "cpu"
%false_2548 = torch.constant.bool false
%2610 = torch.aten.arange.start %int0_2543, %int128_2544, %none_2545, %none_2546, %cpu_2547, %false_2548 : !torch.int, !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[128],si64>
%int2_2549 = torch.constant.int 2
%2611 = torch.aten.floor_divide.Scalar %2610, %int2_2549 : !torch.vtensor<[128],si64>, !torch.int -> !torch.vtensor<[128],si64>
%int6_2550 = torch.constant.int 6
%2612 = torch.prims.convert_element_type %2611, %int6_2550 : !torch.vtensor<[128],si64>, !torch.int -> !torch.vtensor<[128],f32>
%int128_2551 = torch.constant.int 128
%2613 = torch.aten.div.Scalar %2612, %int128_2551 : !torch.vtensor<[128],f32>, !torch.int -> !torch.vtensor<[128],f32>
%float2.000000e00_2552 = torch.constant.float 2.000000e+00
%2614 = torch.aten.mul.Scalar %2613, %float2.000000e00_2552 : !torch.vtensor<[128],f32>, !torch.float -> !torch.vtensor<[128],f32>
%float5.000000e05_2553 = torch.constant.float 5.000000e+05
%2615 = torch.aten.pow.Scalar %float5.000000e05_2553, %2614 : !torch.float, !torch.vtensor<[128],f32> -> !torch.vtensor<[128],f32>
%2616 = torch.aten.reciprocal %2615 : !torch.vtensor<[128],f32> -> !torch.vtensor<[128],f32>
%float1.000000e00_2554 = torch.constant.float 1.000000e+00
%2617 = torch.aten.mul.Scalar %2616, %float1.000000e00_2554 : !torch.vtensor<[128],f32>, !torch.float -> !torch.vtensor<[128],f32>
%int131072_2555 = torch.constant.int 131072
%int1_2556 = torch.constant.int 1
%2618 = torch.prim.ListConstruct %int131072_2555, %int1_2556 : (!torch.int, !torch.int) -> !torch.list<int>
%2619 = torch.aten.view %2609, %2618 : !torch.vtensor<[131072],si64>, !torch.list<int> -> !torch.vtensor<[131072,1],si64>
%2620 = torch.aten.mul.Tensor %2619, %2617 : !torch.vtensor<[131072,1],si64>, !torch.vtensor<[128],f32> -> !torch.vtensor<[131072,128],f32>
%int1_2557 = torch.constant.int 1
%2621 = torch.aten.size.int %2563, %int1_2557 : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>, !torch.int -> !torch.int
%int0_2558 = torch.constant.int 0
%2622 = torch.aten.add.int %int0_2558, %2621 : !torch.int, !torch.int -> !torch.int
%int0_2559 = torch.constant.int 0
%int0_2560 = torch.constant.int 0
%int1_2561 = torch.constant.int 1
%2623 = torch.aten.slice.Tensor %2620, %int0_2559, %int0_2560, %2622, %int1_2561 : !torch.vtensor<[131072,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],f32>
torch.bind_symbolic_shape %2623, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],f32>
%int1_2562 = torch.constant.int 1
%int0_2563 = torch.constant.int 0
%int9223372036854775807_2564 = torch.constant.int 9223372036854775807
%int1_2565 = torch.constant.int 1
%2624 = torch.aten.slice.Tensor %2623, %int1_2562, %int0_2563, %int9223372036854775807_2564, %int1_2565 : !torch.vtensor<[?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],f32>
torch.bind_symbolic_shape %2624, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],f32>
%int1_2566 = torch.constant.int 1
%int0_2567 = torch.constant.int 0
%int9223372036854775807_2568 = torch.constant.int 9223372036854775807
%int1_2569 = torch.constant.int 1
%2625 = torch.aten.slice.Tensor %2624, %int1_2566, %int0_2567, %int9223372036854775807_2568, %int1_2569 : !torch.vtensor<[?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],f32>
torch.bind_symbolic_shape %2625, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],f32>
%int0_2570 = torch.constant.int 0
%2626 = torch.aten.unsqueeze %2625, %int0_2570 : !torch.vtensor<[?,128],f32>, !torch.int -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %2626, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int1_2571 = torch.constant.int 1
%int0_2572 = torch.constant.int 0
%int9223372036854775807_2573 = torch.constant.int 9223372036854775807
%int1_2574 = torch.constant.int 1
%2627 = torch.aten.slice.Tensor %2626, %int1_2571, %int0_2572, %int9223372036854775807_2573, %int1_2574 : !torch.vtensor<[1,?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %2627, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int2_2575 = torch.constant.int 2
%int0_2576 = torch.constant.int 0
%int9223372036854775807_2577 = torch.constant.int 9223372036854775807
%int1_2578 = torch.constant.int 1
%2628 = torch.aten.slice.Tensor %2627, %int2_2575, %int0_2576, %int9223372036854775807_2577, %int1_2578 : !torch.vtensor<[1,?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %2628, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int1_2579 = torch.constant.int 1
%int1_2580 = torch.constant.int 1
%int1_2581 = torch.constant.int 1
%2629 = torch.prim.ListConstruct %int1_2579, %int1_2580, %int1_2581 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2630 = torch.aten.repeat %2628, %2629 : !torch.vtensor<[1,?,128],f32>, !torch.list<int> -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %2630, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int6_2582 = torch.constant.int 6
%2631 = torch.prims.convert_element_type %2578, %int6_2582 : !torch.vtensor<[1,?,8,128],bf16>, !torch.int -> !torch.vtensor<[1,?,8,128],f32>
torch.bind_symbolic_shape %2631, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],f32>
%2632 = torch_c.to_builtin_tensor %2631 : !torch.vtensor<[1,?,8,128],f32> -> tensor<1x?x8x128xf32>
%2633 = torch_c.to_builtin_tensor %2630 : !torch.vtensor<[1,?,128],f32> -> tensor<1x?x128xf32>
%2634 = util.call @sharktank_rotary_embedding_1_D_8_128_f32(%2632, %2633) : (tensor<1x?x8x128xf32>, tensor<1x?x128xf32>) -> tensor<1x?x8x128xf32>
%2635 = torch_c.from_builtin_tensor %2634 : tensor<1x?x8x128xf32> -> !torch.vtensor<[1,?,8,128],f32>
torch.bind_symbolic_shape %2635, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],f32>
%int15_2583 = torch.constant.int 15
%2636 = torch.prims.convert_element_type %2635, %int15_2583 : !torch.vtensor<[1,?,8,128],f32>, !torch.int -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %2636, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
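    // Quantize the rotated keys and the values to f8E4M3FNUZ for the KV cache:
    // divide by what appears to be the cache quantizer scale, clamp to [-240, 240], cast.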
%2637 = torch.aten.div.Tensor %2636, %161 : !torch.vtensor<[1,?,8,128],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %2637, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
%float-2.400000e02_2584 = torch.constant.float -2.400000e+02
%float2.400000e02_2585 = torch.constant.float 2.400000e+02
%2638 = torch.aten.clamp %2637, %float-2.400000e02_2584, %float2.400000e02_2585 : !torch.vtensor<[1,?,8,128],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %2638, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
%int26_2586 = torch.constant.int 26
%2639 = torch.prims.convert_element_type %2638, %int26_2586 : !torch.vtensor<[1,?,8,128],bf16>, !torch.int -> !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %2639, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>
%2640 = torch.aten.div.Tensor %2580, %161 : !torch.vtensor<[1,?,8,128],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %2640, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
%float-2.400000e02_2587 = torch.constant.float -2.400000e+02
%float2.400000e02_2588 = torch.constant.float 2.400000e+02
%2641 = torch.aten.clamp %2640, %float-2.400000e02_2587, %float2.400000e02_2588 : !torch.vtensor<[1,?,8,128],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %2641, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
%int26_2589 = torch.constant.int 26
%2642 = torch.prims.convert_element_type %2641, %int26_2589 : !torch.vtensor<[1,?,8,128],bf16>, !torch.int -> !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %2642, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>
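    // Scatter the quantized keys into the paged KV cache: compute flat slot indices
    // (page id * 64 + offset, here 18, which appears to select this block's key sub-slot),
    // view the flat [?,2097152] f16 buffer as [pages,32,2,32,8,128], index_put the new keys,
    // and flatten the buffer back.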
%int64_2590 = torch.constant.int 64
%2643 = torch.aten.mul.Scalar %arg2, %int64_2590 : !torch.vtensor<[1,?],si64>, !torch.int -> !torch.vtensor<[1,?],si64>
torch.bind_symbolic_shape %2643, [%548], affine_map<()[s0] -> (1, s0)> : !torch.vtensor<[1,?],si64>
%int18 = torch.constant.int 18
%int1_2591 = torch.constant.int 1
%2644 = torch.aten.add.Scalar %2643, %int18, %int1_2591 : !torch.vtensor<[1,?],si64>, !torch.int, !torch.int -> !torch.vtensor<[1,?],si64>
torch.bind_symbolic_shape %2644, [%548], affine_map<()[s0] -> (1, s0)> : !torch.vtensor<[1,?],si64>
%int1_2592 = torch.constant.int 1
%int32_2593 = torch.constant.int 32
%int8_2594 = torch.constant.int 8
%int128_2595 = torch.constant.int 128
%2645 = torch.prim.ListConstruct %int1_2592, %670, %int32_2593, %int8_2594, %int128_2595 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2646 = torch.aten.view %2639, %2645 : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %2646, [%548], affine_map<()[s0] -> (1, s0, 32, 8, 128)> : !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>
%int32_2596 = torch.constant.int 32
%int8_2597 = torch.constant.int 8
%int128_2598 = torch.constant.int 128
%2647 = torch.prim.ListConstruct %670, %int32_2596, %int8_2597, %int128_2598 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2648 = torch.aten.view %2646, %2647 : !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %2648, [%548], affine_map<()[s0] -> (s0, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
%2649 = torch.prim.ListConstruct %670 : (!torch.int) -> !torch.list<int>
%2650 = torch.aten.view %2644, %2649 : !torch.vtensor<[1,?],si64>, !torch.list<int> -> !torch.vtensor<[?],si64>
torch.bind_symbolic_shape %2650, [%548], affine_map<()[s0] -> (s0)> : !torch.vtensor<[?],si64>
%int32_2599 = torch.constant.int 32
%int2_2600 = torch.constant.int 2
%int32_2601 = torch.constant.int 32
%int8_2602 = torch.constant.int 8
%int128_2603 = torch.constant.int 128
%2651 = torch.prim.ListConstruct %661, %int32_2599, %int2_2600, %int32_2601, %int8_2602, %int128_2603 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2652 = torch.aten.view %2459, %2651 : !torch.vtensor<[?,2097152],f16>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f16>
torch.bind_symbolic_shape %2652, [%549], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f16>
%int32_2604 = torch.constant.int 32
%2653 = torch.aten.mul.int %661, %int32_2604 : !torch.int, !torch.int -> !torch.int
%int2_2605 = torch.constant.int 2
%2654 = torch.aten.mul.int %2653, %int2_2605 : !torch.int, !torch.int -> !torch.int
%int32_2606 = torch.constant.int 32
%int8_2607 = torch.constant.int 8
%int128_2608 = torch.constant.int 128
%2655 = torch.prim.ListConstruct %2654, %int32_2606, %int8_2607, %int128_2608 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2656 = torch.aten.view %2652, %2655 : !torch.vtensor<[?,32,2,32,8,128],f16>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f16>
torch.bind_symbolic_shape %2656, [%549], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f16>
%2657 = torch.prim.ListConstruct %2650 : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>>
%false_2609 = torch.constant.bool false
%2658 = torch.aten.index_put %2656, %2657, %2648, %false_2609 : !torch.vtensor<[?,32,8,128],f16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.bool -> !torch.vtensor<[?,32,8,128],f16>
torch.bind_symbolic_shape %2658, [%549], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f16>
%int32_2610 = torch.constant.int 32
%int2_2611 = torch.constant.int 2
%int32_2612 = torch.constant.int 32
%int8_2613 = torch.constant.int 8
%int128_2614 = torch.constant.int 128
%2659 = torch.prim.ListConstruct %661, %int32_2610, %int2_2611, %int32_2612, %int8_2613, %int128_2614 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2660 = torch.aten.view %2658, %2659 : !torch.vtensor<[?,32,8,128],f16>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f16>
torch.bind_symbolic_shape %2660, [%549], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f16>
%int2097152_2615 = torch.constant.int 2097152
%2661 = torch.prim.ListConstruct %661, %int2097152_2615 : (!torch.int, !torch.int) -> !torch.list<int>
%2662 = torch.aten.view %2660, %2661 : !torch.vtensor<[?,32,2,32,8,128],f16>, !torch.list<int> -> !torch.vtensor<[?,2097152],f16>
torch.bind_symbolic_shape %2662, [%549], affine_map<()[s0] -> (s0, 2097152)> : !torch.vtensor<[?,2097152],f16>
%int32_2616 = torch.constant.int 32
%int2_2617 = torch.constant.int 2
%int32_2618 = torch.constant.int 32
%int8_2619 = torch.constant.int 8
%int128_2620 = torch.constant.int 128
%2663 = torch.prim.ListConstruct %661, %int32_2616, %int2_2617, %int32_2618, %int8_2619, %int128_2620 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2664 = torch.aten.view %2662, %2663 : !torch.vtensor<[?,2097152],f16>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f16>
torch.bind_symbolic_shape %2664, [%549], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f16>
%int32_2621 = torch.constant.int 32
%int8_2622 = torch.constant.int 8
%int128_2623 = torch.constant.int 128
%2665 = torch.prim.ListConstruct %2654, %int32_2621, %int8_2622, %int128_2623 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2666 = torch.aten.view %2664, %2665 : !torch.vtensor<[?,32,2,32,8,128],f16>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f16>
torch.bind_symbolic_shape %2666, [%549], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f16>
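    // Scatter the quantized values at the adjacent sub-slot (index + 1) of the same paged cache.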
%int1_2624 = torch.constant.int 1
%int32_2625 = torch.constant.int 32
%int8_2626 = torch.constant.int 8
%int128_2627 = torch.constant.int 128
%2667 = torch.prim.ListConstruct %int1_2624, %670, %int32_2625, %int8_2626, %int128_2627 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2668 = torch.aten.view %2642, %2667 : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %2668, [%548], affine_map<()[s0] -> (1, s0, 32, 8, 128)> : !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>
%int32_2628 = torch.constant.int 32
%int8_2629 = torch.constant.int 8
%int128_2630 = torch.constant.int 128
%2669 = torch.prim.ListConstruct %670, %int32_2628, %int8_2629, %int128_2630 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2670 = torch.aten.view %2668, %2669 : !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %2670, [%548], affine_map<()[s0] -> (s0, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
%int1_2631 = torch.constant.int 1
%int1_2632 = torch.constant.int 1
%2671 = torch.aten.add.Scalar %2644, %int1_2631, %int1_2632 : !torch.vtensor<[1,?],si64>, !torch.int, !torch.int -> !torch.vtensor<[1,?],si64>
torch.bind_symbolic_shape %2671, [%548], affine_map<()[s0] -> (1, s0)> : !torch.vtensor<[1,?],si64>
%2672 = torch.prim.ListConstruct %670 : (!torch.int) -> !torch.list<int>
%2673 = torch.aten.view %2671, %2672 : !torch.vtensor<[1,?],si64>, !torch.list<int> -> !torch.vtensor<[?],si64>
torch.bind_symbolic_shape %2673, [%548], affine_map<()[s0] -> (s0)> : !torch.vtensor<[?],si64>
%2674 = torch.prim.ListConstruct %2673 : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>>
%false_2633 = torch.constant.bool false
%2675 = torch.aten.index_put %2666, %2674, %2670, %false_2633 : !torch.vtensor<[?,32,8,128],f16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.bool -> !torch.vtensor<[?,32,8,128],f16>
torch.bind_symbolic_shape %2675, [%549], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f16>
%int32_2634 = torch.constant.int 32
%int2_2635 = torch.constant.int 2
%int32_2636 = torch.constant.int 32
%int8_2637 = torch.constant.int 8
%int128_2638 = torch.constant.int 128
%2676 = torch.prim.ListConstruct %661, %int32_2634, %int2_2635, %int32_2636, %int8_2637, %int128_2638 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2677 = torch.aten.view %2675, %2676 : !torch.vtensor<[?,32,8,128],f16>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f16>
torch.bind_symbolic_shape %2677, [%549], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f16>
%int2097152_2639 = torch.constant.int 2097152
%2678 = torch.prim.ListConstruct %661, %int2097152_2639 : (!torch.int, !torch.int) -> !torch.list<int>
%2679 = torch.aten.view %2677, %2678 : !torch.vtensor<[?,32,2,32,8,128],f16>, !torch.list<int> -> !torch.vtensor<[?,2097152],f16>
torch.bind_symbolic_shape %2679, [%549], affine_map<()[s0] -> (s0, 2097152)> : !torch.vtensor<[?,2097152],f16>
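    // Grouped-query expansion: unsqueeze and expand each of the 8 KV heads 4x,
    // flattening keys and values to [1,?,32,128] to match the 32 query heads.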
%int-2_2640 = torch.constant.int -2
%2680 = torch.aten.unsqueeze %2639, %int-2_2640 : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %2680, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 1, 128)> : !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>
%int1_2641 = torch.constant.int 1
%int8_2642 = torch.constant.int 8
%int4_2643 = torch.constant.int 4
%int128_2644 = torch.constant.int 128
%2681 = torch.prim.ListConstruct %int1_2641, %2621, %int8_2642, %int4_2643, %int128_2644 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%false_2645 = torch.constant.bool false
%2682 = torch.aten.expand %2680, %2681, %false_2645 : !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %2682, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 4, 128)> : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
%int0_2646 = torch.constant.int 0
%2683 = torch.aten.clone %2682, %int0_2646 : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %2683, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 4, 128)> : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
%int1_2647 = torch.constant.int 1
%int32_2648 = torch.constant.int 32
%int128_2649 = torch.constant.int 128
%2684 = torch.prim.ListConstruct %int1_2647, %2621, %int32_2648, %int128_2649 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2685 = torch.aten._unsafe_view %2683, %2684 : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %2685, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>
%int-2_2650 = torch.constant.int -2
%2686 = torch.aten.unsqueeze %2642, %int-2_2650 : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %2686, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 1, 128)> : !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>
%int1_2651 = torch.constant.int 1
%2687 = torch.aten.size.int %2573, %int1_2651 : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>, !torch.int -> !torch.int
%int1_2652 = torch.constant.int 1
%int8_2653 = torch.constant.int 8
%int4_2654 = torch.constant.int 4
%int128_2655 = torch.constant.int 128
%2688 = torch.prim.ListConstruct %int1_2652, %2687, %int8_2653, %int4_2654, %int128_2655 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%false_2656 = torch.constant.bool false
%2689 = torch.aten.expand %2686, %2688, %false_2656 : !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %2689, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 4, 128)> : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
%int0_2657 = torch.constant.int 0
%2690 = torch.aten.clone %2689, %int0_2657 : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %2690, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 4, 128)> : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
%int1_2658 = torch.constant.int 1
%int32_2659 = torch.constant.int 32
%int128_2660 = torch.constant.int 128
%2691 = torch.prim.ListConstruct %int1_2658, %2687, %int32_2659, %int128_2660 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2692 = torch.aten._unsafe_view %2690, %2691 : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %2692, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>
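    // Dequantize the expanded keys and values back to bf16 by multiplying with the same scale.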
%int6_2661 = torch.constant.int 6
%2693 = torch.prims.convert_element_type %2685, %int6_2661 : !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,32,128],f32>
torch.bind_symbolic_shape %2693, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32>
%2694 = torch.aten.mul.Tensor %2693, %161 : !torch.vtensor<[1,?,32,128],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,32,128],f32>
torch.bind_symbolic_shape %2694, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32>
%int15_2662 = torch.constant.int 15
%2695 = torch.prims.convert_element_type %2694, %int15_2662 : !torch.vtensor<[1,?,32,128],f32>, !torch.int -> !torch.vtensor<[1,?,32,128],bf16>
torch.bind_symbolic_shape %2695, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16>
%int6_2663 = torch.constant.int 6
%2696 = torch.prims.convert_element_type %2692, %int6_2663 : !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,32,128],f32>
torch.bind_symbolic_shape %2696, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32>
%2697 = torch.aten.mul.Tensor %2696, %161 : !torch.vtensor<[1,?,32,128],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,32,128],f32>
torch.bind_symbolic_shape %2697, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32>
%int15_2664 = torch.constant.int 15
%2698 = torch.prims.convert_element_type %2697, %int15_2664 : !torch.vtensor<[1,?,32,128],f32>, !torch.int -> !torch.vtensor<[1,?,32,128],bf16>
torch.bind_symbolic_shape %2698, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16>
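    // Transpose Q, K, V to [1,32,?,128] and run causal scaled-dot-product attention
    // (dropout 0, is_causal = true).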
%int1_2665 = torch.constant.int 1
%int2_2666 = torch.constant.int 2
%2699 = torch.aten.transpose.int %2608, %int1_2665, %int2_2666 : !torch.vtensor<[1,?,32,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,32,?,128],bf16>
torch.bind_symbolic_shape %2699, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16>
%int1_2667 = torch.constant.int 1
%int2_2668 = torch.constant.int 2
%2700 = torch.aten.transpose.int %2695, %int1_2667, %int2_2668 : !torch.vtensor<[1,?,32,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,32,?,128],bf16>
torch.bind_symbolic_shape %2700, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16>
%int1_2669 = torch.constant.int 1
%int2_2670 = torch.constant.int 2
%2701 = torch.aten.transpose.int %2698, %int1_2669, %int2_2670 : !torch.vtensor<[1,?,32,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,32,?,128],bf16>
torch.bind_symbolic_shape %2701, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16>
%float0.000000e00_2671 = torch.constant.float 0.000000e+00
%true_2672 = torch.constant.bool true
%none_2673 = torch.constant.none
%none_2674 = torch.constant.none
%2702:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%2699, %2700, %2701, %float0.000000e00_2671, %true_2672, %none_2673, %none_2674) : (!torch.vtensor<[1,32,?,128],bf16>, !torch.vtensor<[1,32,?,128],bf16>, !torch.vtensor<[1,32,?,128],bf16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[1,32,?,128],bf16>, !torch.vtensor<[1,32,?],f32>)
torch.bind_symbolic_shape %2702#0, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16>
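    // Transpose the attention output back to [1,?,32,128], flatten to [1,?,4096], quantize,
    // apply the f8 [4096x4096] output projection, widen to bf16, and add the residual.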
%int1_2675 = torch.constant.int 1
%int2_2676 = torch.constant.int 2
%2703 = torch.aten.transpose.int %2702#0, %int1_2675, %int2_2676 : !torch.vtensor<[1,32,?,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,?,32,128],bf16>
torch.bind_symbolic_shape %2703, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16>
%int1_2677 = torch.constant.int 1
%int4096_2678 = torch.constant.int 4096
%2704 = torch.prim.ListConstruct %int1_2677, %2593, %int4096_2678 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2705 = torch.aten.view %2703, %2704 : !torch.vtensor<[1,?,32,128],bf16>, !torch.list<int> -> !torch.vtensor<[1,?,4096],bf16>
torch.bind_symbolic_shape %2705, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16>
%2706 = torch.aten.div.Tensor %2705, %162 : !torch.vtensor<[1,?,4096],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],bf16>
torch.bind_symbolic_shape %2706, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16>
%float-2.400000e02_2679 = torch.constant.float -2.400000e+02
%float2.400000e02_2680 = torch.constant.float 2.400000e+02
%2707 = torch.aten.clamp %2706, %float-2.400000e02_2679, %float2.400000e02_2680 : !torch.vtensor<[1,?,4096],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],bf16>
torch.bind_symbolic_shape %2707, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16>
%int26_2681 = torch.constant.int 26
%2708 = torch.prims.convert_element_type %2707, %int26_2681 : !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2708, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int-2_2682 = torch.constant.int -2
%int-1_2683 = torch.constant.int -1
%2709 = torch.aten.transpose.int %163, %int-2_2682, %int-1_2683 : !torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%int4096_2684 = torch.constant.int 4096
%2710 = torch.prim.ListConstruct %2593, %int4096_2684 : (!torch.int, !torch.int) -> !torch.list<int>
%2711 = torch.aten.view %2708, %2710 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2711, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%2712 = torch.aten.mm %2711, %2709 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,4096],f8E4M3FNUZ> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2712, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%int1_2685 = torch.constant.int 1
%int4096_2686 = torch.constant.int 4096
%2713 = torch.prim.ListConstruct %int1_2685, %2593, %int4096_2686 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2714 = torch.aten.view %2712, %2713 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2714, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int15_2687 = torch.constant.int 15
%2715 = torch.prims.convert_element_type %2714, %int15_2687 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,4096],bf16>
torch.bind_symbolic_shape %2715, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16>
%int1_2688 = torch.constant.int 1
%2716 = torch.aten.add.Tensor %2537, %2715, %int1_2688 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2716, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
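    // RMSNorm of the residual stream (mean of squares + 1e-5, rsqrt, scale by the norm weight,
    // likely ffn_norm), then quantize the result for the FFN.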
%int2_2689 = torch.constant.int 2
%2717 = torch.aten.pow.Tensor_Scalar %2716, %int2_2689 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2717, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int-1_2690 = torch.constant.int -1
%2718 = torch.prim.ListConstruct %int-1_2690 : (!torch.int) -> !torch.list<int>
%true_2691 = torch.constant.bool true
%none_2692 = torch.constant.none
%2719 = torch.aten.mean.dim %2717, %2718, %true_2691, %none_2692 : !torch.vtensor<[1,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,?,1],f32>
torch.bind_symbolic_shape %2719, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32>
%float1.000000e-05_2693 = torch.constant.float 1.000000e-05
%int1_2694 = torch.constant.int 1
%2720 = torch.aten.add.Scalar %2719, %float1.000000e-05_2693, %int1_2694 : !torch.vtensor<[1,?,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,?,1],f32>
torch.bind_symbolic_shape %2720, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32>
%2721 = torch.aten.rsqrt %2720 : !torch.vtensor<[1,?,1],f32> -> !torch.vtensor<[1,?,1],f32>
torch.bind_symbolic_shape %2721, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32>
%2722 = torch.aten.mul.Tensor %2716, %2721 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[1,?,1],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2722, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%2723 = torch.aten.mul.Tensor %164, %2722 : !torch.vtensor<[4096],bf16>, !torch.vtensor<[1,?,4096],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2723, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%2724 = torch.aten.div.Tensor %2723, %165 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2724, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%float-2.400000e02_2695 = torch.constant.float -2.400000e+02
%float2.400000e02_2696 = torch.constant.float 2.400000e+02
%2725 = torch.aten.clamp %2724, %float-2.400000e02_2695, %float2.400000e02_2696 : !torch.vtensor<[1,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2725, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int26_2697 = torch.constant.int 26
%2726 = torch.prims.convert_element_type %2725, %int26_2697 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2726, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
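    // FFN gate projection: fp8 matmul [?,4096] x [4096,14336], widen to bf16, apply SiLU.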
%int-2_2698 = torch.constant.int -2
%int-1_2699 = torch.constant.int -1
%2727 = torch.aten.transpose.int %166, %int-2_2698, %int-1_2699 : !torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
%int4096_2700 = torch.constant.int 4096
%2728 = torch.prim.ListConstruct %566, %int4096_2700 : (!torch.int, !torch.int) -> !torch.list<int>
%2729 = torch.aten.view %2726, %2728 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2729, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%2730 = torch.aten.mm %2729, %2727 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,14336],f8E4M3FNUZ> -> !torch.vtensor<[?,14336],f8E4M3FNUZ>
torch.bind_symbolic_shape %2730, [%548], affine_map<()[s0] -> (s0 * 32, 14336)> : !torch.vtensor<[?,14336],f8E4M3FNUZ>
%int1_2701 = torch.constant.int 1
%int14336_2702 = torch.constant.int 14336
%2731 = torch.prim.ListConstruct %int1_2701, %566, %int14336_2702 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2732 = torch.aten.view %2730, %2731 : !torch.vtensor<[?,14336],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,14336],f8E4M3FNUZ>
torch.bind_symbolic_shape %2732, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>
%int15_2703 = torch.constant.int 15
%2733 = torch.prims.convert_element_type %2732, %int15_2703 : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,14336],bf16>
torch.bind_symbolic_shape %2733, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16>
%2734 = torch.aten.silu %2733 : !torch.vtensor<[1,?,14336],bf16> -> !torch.vtensor<[1,?,14336],bf16>
torch.bind_symbolic_shape %2734, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16>
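    // FFN up projection with its own input scale, then the SwiGLU product silu(gate) * up.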
%2735 = torch.aten.div.Tensor %2723, %167 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2735, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%float-2.400000e02_2704 = torch.constant.float -2.400000e+02
%float2.400000e02_2705 = torch.constant.float 2.400000e+02
%2736 = torch.aten.clamp %2735, %float-2.400000e02_2704, %float2.400000e02_2705 : !torch.vtensor<[1,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2736, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int26_2706 = torch.constant.int 26
%2737 = torch.prims.convert_element_type %2736, %int26_2706 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2737, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int-2_2707 = torch.constant.int -2
%int-1_2708 = torch.constant.int -1
%2738 = torch.aten.transpose.int %168, %int-2_2707, %int-1_2708 : !torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
%int4096_2709 = torch.constant.int 4096
%2739 = torch.prim.ListConstruct %566, %int4096_2709 : (!torch.int, !torch.int) -> !torch.list<int>
%2740 = torch.aten.view %2737, %2739 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2740, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%2741 = torch.aten.mm %2740, %2738 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,14336],f8E4M3FNUZ> -> !torch.vtensor<[?,14336],f8E4M3FNUZ>
torch.bind_symbolic_shape %2741, [%548], affine_map<()[s0] -> (s0 * 32, 14336)> : !torch.vtensor<[?,14336],f8E4M3FNUZ>
%int1_2710 = torch.constant.int 1
%int14336_2711 = torch.constant.int 14336
%2742 = torch.prim.ListConstruct %int1_2710, %566, %int14336_2711 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2743 = torch.aten.view %2741, %2742 : !torch.vtensor<[?,14336],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,14336],f8E4M3FNUZ>
torch.bind_symbolic_shape %2743, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>
%int15_2712 = torch.constant.int 15
%2744 = torch.prims.convert_element_type %2743, %int15_2712 : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,14336],bf16>
torch.bind_symbolic_shape %2744, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16>
%2745 = torch.aten.mul.Tensor %2734, %2744 : !torch.vtensor<[1,?,14336],bf16>, !torch.vtensor<[1,?,14336],bf16> -> !torch.vtensor<[1,?,14336],bf16>
torch.bind_symbolic_shape %2745, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16>
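    // Quantize the product, apply the [14336 -> 4096] down projection, widen to bf16,
    // and add the residual.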
%2746 = torch.aten.div.Tensor %2745, %169 : !torch.vtensor<[1,?,14336],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,14336],bf16>
torch.bind_symbolic_shape %2746, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16>
%float-2.400000e02_2713 = torch.constant.float -2.400000e+02
%float2.400000e02_2714 = torch.constant.float 2.400000e+02
%2747 = torch.aten.clamp %2746, %float-2.400000e02_2713, %float2.400000e02_2714 : !torch.vtensor<[1,?,14336],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,14336],bf16>
torch.bind_symbolic_shape %2747, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16>
%int26_2715 = torch.constant.int 26
%2748 = torch.prims.convert_element_type %2747, %int26_2715 : !torch.vtensor<[1,?,14336],bf16>, !torch.int -> !torch.vtensor<[1,?,14336],f8E4M3FNUZ>
torch.bind_symbolic_shape %2748, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>
%int-2_2716 = torch.constant.int -2
%int-1_2717 = torch.constant.int -1
%2749 = torch.aten.transpose.int %170, %int-2_2716, %int-1_2717 : !torch.vtensor<[4096,14336],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%int1_2718 = torch.constant.int 1
%2750 = torch.aten.size.int %2732, %int1_2718 : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>, !torch.int -> !torch.int
%int14336_2719 = torch.constant.int 14336
%2751 = torch.prim.ListConstruct %2750, %int14336_2719 : (!torch.int, !torch.int) -> !torch.list<int>
%2752 = torch.aten.view %2748, %2751 : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,14336],f8E4M3FNUZ>
torch.bind_symbolic_shape %2752, [%548], affine_map<()[s0] -> (s0 * 32, 14336)> : !torch.vtensor<[?,14336],f8E4M3FNUZ>
%2753 = torch.aten.mm %2752, %2749 : !torch.vtensor<[?,14336],f8E4M3FNUZ>, !torch.vtensor<[14336,4096],f8E4M3FNUZ> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2753, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%int1_2720 = torch.constant.int 1
%int4096_2721 = torch.constant.int 4096
%2754 = torch.prim.ListConstruct %int1_2720, %2750, %int4096_2721 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2755 = torch.aten.view %2753, %2754 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2755, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int15_2722 = torch.constant.int 15
%2756 = torch.prims.convert_element_type %2755, %int15_2722 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,4096],bf16>
torch.bind_symbolic_shape %2756, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16>
%int1_2723 = torch.constant.int 1
%2757 = torch.aten.add.Tensor %2716, %2756, %int1_2723 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2757, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
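    // Next transformer block: RMSNorm of the residual (attn_norm), then the f8 query (4096->4096),
    // key (4096->1024), and value (4096->1024) projections.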
%int2_2724 = torch.constant.int 2
%2758 = torch.aten.pow.Tensor_Scalar %2757, %int2_2724 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2758, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int-1_2725 = torch.constant.int -1
%2759 = torch.prim.ListConstruct %int-1_2725 : (!torch.int) -> !torch.list<int>
%true_2726 = torch.constant.bool true
%none_2727 = torch.constant.none
%2760 = torch.aten.mean.dim %2758, %2759, %true_2726, %none_2727 : !torch.vtensor<[1,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,?,1],f32>
torch.bind_symbolic_shape %2760, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32>
%float1.000000e-05_2728 = torch.constant.float 1.000000e-05
%int1_2729 = torch.constant.int 1
%2761 = torch.aten.add.Scalar %2760, %float1.000000e-05_2728, %int1_2729 : !torch.vtensor<[1,?,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,?,1],f32>
torch.bind_symbolic_shape %2761, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32>
%2762 = torch.aten.rsqrt %2761 : !torch.vtensor<[1,?,1],f32> -> !torch.vtensor<[1,?,1],f32>
torch.bind_symbolic_shape %2762, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32>
%2763 = torch.aten.mul.Tensor %2757, %2762 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[1,?,1],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2763, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%2764 = torch.aten.mul.Tensor %171, %2763 : !torch.vtensor<[4096],bf16>, !torch.vtensor<[1,?,4096],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2764, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%2765 = torch.aten.div.Tensor %2764, %172 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2765, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%float-2.400000e02_2730 = torch.constant.float -2.400000e+02
%float2.400000e02_2731 = torch.constant.float 2.400000e+02
%2766 = torch.aten.clamp %2765, %float-2.400000e02_2730, %float2.400000e02_2731 : !torch.vtensor<[1,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2766, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int26_2732 = torch.constant.int 26
%2767 = torch.prims.convert_element_type %2766, %int26_2732 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2767, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int-2_2733 = torch.constant.int -2
%int-1_2734 = torch.constant.int -1
%2768 = torch.aten.transpose.int %173, %int-2_2733, %int-1_2734 : !torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%int4096_2735 = torch.constant.int 4096
%2769 = torch.prim.ListConstruct %566, %int4096_2735 : (!torch.int, !torch.int) -> !torch.list<int>
%2770 = torch.aten.view %2767, %2769 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2770, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%2771 = torch.aten.mm %2770, %2768 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,4096],f8E4M3FNUZ> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2771, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%int1_2736 = torch.constant.int 1
%int4096_2737 = torch.constant.int 4096
%2772 = torch.prim.ListConstruct %int1_2736, %566, %int4096_2737 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2773 = torch.aten.view %2771, %2772 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2773, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int15_2738 = torch.constant.int 15
%2774 = torch.prims.convert_element_type %2773, %int15_2738 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,4096],bf16>
torch.bind_symbolic_shape %2774, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16>
%2775 = torch.aten.div.Tensor %2764, %174 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2775, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%float-2.400000e02_2739 = torch.constant.float -2.400000e+02
%float2.400000e02_2740 = torch.constant.float 2.400000e+02
%2776 = torch.aten.clamp %2775, %float-2.400000e02_2739, %float2.400000e02_2740 : !torch.vtensor<[1,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2776, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int26_2741 = torch.constant.int 26
%2777 = torch.prims.convert_element_type %2776, %int26_2741 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2777, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int-2_2742 = torch.constant.int -2
%int-1_2743 = torch.constant.int -1
%2778 = torch.aten.transpose.int %175, %int-2_2742, %int-1_2743 : !torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,1024],f8E4M3FNUZ>
%int4096_2744 = torch.constant.int 4096
%2779 = torch.prim.ListConstruct %566, %int4096_2744 : (!torch.int, !torch.int) -> !torch.list<int>
%2780 = torch.aten.view %2777, %2779 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2780, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%2781 = torch.aten.mm %2780, %2778 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,1024],f8E4M3FNUZ> -> !torch.vtensor<[?,1024],f8E4M3FNUZ>
torch.bind_symbolic_shape %2781, [%548], affine_map<()[s0] -> (s0 * 32, 1024)> : !torch.vtensor<[?,1024],f8E4M3FNUZ>
%int1_2745 = torch.constant.int 1
%int1024_2746 = torch.constant.int 1024
%2782 = torch.prim.ListConstruct %int1_2745, %566, %int1024_2746 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2783 = torch.aten.view %2781, %2782 : !torch.vtensor<[?,1024],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,1024],f8E4M3FNUZ>
torch.bind_symbolic_shape %2783, [%548], affine_map<()[s0] -> (1, s0 * 32, 1024)> : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>
%int15_2747 = torch.constant.int 15
%2784 = torch.prims.convert_element_type %2783, %int15_2747 : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,1024],bf16>
torch.bind_symbolic_shape %2784, [%548], affine_map<()[s0] -> (1, s0 * 32, 1024)> : !torch.vtensor<[1,?,1024],bf16>
%2785 = torch.aten.div.Tensor %2764, %176 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2785, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%float-2.400000e02_2748 = torch.constant.float -2.400000e+02
%float2.400000e02_2749 = torch.constant.float 2.400000e+02
%2786 = torch.aten.clamp %2785, %float-2.400000e02_2748, %float2.400000e02_2749 : !torch.vtensor<[1,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2786, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int26_2750 = torch.constant.int 26
%2787 = torch.prims.convert_element_type %2786, %int26_2750 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2787, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int-2_2751 = torch.constant.int -2
%int-1_2752 = torch.constant.int -1
%2788 = torch.aten.transpose.int %177, %int-2_2751, %int-1_2752 : !torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,1024],f8E4M3FNUZ>
%int4096_2753 = torch.constant.int 4096
%2789 = torch.prim.ListConstruct %566, %int4096_2753 : (!torch.int, !torch.int) -> !torch.list<int>
%2790 = torch.aten.view %2787, %2789 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2790, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%2791 = torch.aten.mm %2790, %2788 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,1024],f8E4M3FNUZ> -> !torch.vtensor<[?,1024],f8E4M3FNUZ>
torch.bind_symbolic_shape %2791, [%548], affine_map<()[s0] -> (s0 * 32, 1024)> : !torch.vtensor<[?,1024],f8E4M3FNUZ>
%int1_2754 = torch.constant.int 1
%int1024_2755 = torch.constant.int 1024
%2792 = torch.prim.ListConstruct %int1_2754, %566, %int1024_2755 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2793 = torch.aten.view %2791, %2792 : !torch.vtensor<[?,1024],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,1024],f8E4M3FNUZ>
torch.bind_symbolic_shape %2793, [%548], affine_map<()[s0] -> (1, s0 * 32, 1024)> : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>
%int15_2756 = torch.constant.int 15
%2794 = torch.prims.convert_element_type %2793, %int15_2756 : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,1024],bf16>
torch.bind_symbolic_shape %2794, [%548], affine_map<()[s0] -> (1, s0 * 32, 1024)> : !torch.vtensor<[1,?,1024],bf16>
%int1_2757 = torch.constant.int 1
%int32_2758 = torch.constant.int 32
%int128_2759 = torch.constant.int 128
%2795 = torch.prim.ListConstruct %int1_2757, %566, %int32_2758, %int128_2759 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2796 = torch.aten.view %2774, %2795 : !torch.vtensor<[1,?,4096],bf16>, !torch.list<int> -> !torch.vtensor<[1,?,32,128],bf16>
torch.bind_symbolic_shape %2796, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16>
%int1_2760 = torch.constant.int 1
%int8_2761 = torch.constant.int 8
%int128_2762 = torch.constant.int 128
%2797 = torch.prim.ListConstruct %int1_2760, %566, %int8_2761, %int128_2762 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2798 = torch.aten.view %2784, %2797 : !torch.vtensor<[1,?,1024],bf16>, !torch.list<int> -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %2798, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
%int1_2763 = torch.constant.int 1
%int8_2764 = torch.constant.int 8
%int128_2765 = torch.constant.int 128
%2799 = torch.prim.ListConstruct %int1_2763, %566, %int8_2764, %int128_2765 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2800 = torch.aten.view %2794, %2799 : !torch.vtensor<[1,?,1024],bf16>, !torch.list<int> -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %2800, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
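// RoPE setup: positions 0..131071 and per-channel inverse frequencies 1 / 500000^(2*floor(i/2)/128) for i in 0..127, combined into a [131072, 128] angle table.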
%int131072_2766 = torch.constant.int 131072
%none_2767 = torch.constant.none
%none_2768 = torch.constant.none
%cpu_2769 = torch.constant.device "cpu"
%false_2770 = torch.constant.bool false
%2801 = torch.aten.arange %int131072_2766, %none_2767, %none_2768, %cpu_2769, %false_2770 : !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[131072],si64>
%int0_2771 = torch.constant.int 0
%int128_2772 = torch.constant.int 128
%none_2773 = torch.constant.none
%none_2774 = torch.constant.none
%cpu_2775 = torch.constant.device "cpu"
%false_2776 = torch.constant.bool false
%2802 = torch.aten.arange.start %int0_2771, %int128_2772, %none_2773, %none_2774, %cpu_2775, %false_2776 : !torch.int, !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[128],si64>
%int2_2777 = torch.constant.int 2
%2803 = torch.aten.floor_divide.Scalar %2802, %int2_2777 : !torch.vtensor<[128],si64>, !torch.int -> !torch.vtensor<[128],si64>
%int6_2778 = torch.constant.int 6
%2804 = torch.prims.convert_element_type %2803, %int6_2778 : !torch.vtensor<[128],si64>, !torch.int -> !torch.vtensor<[128],f32>
%int128_2779 = torch.constant.int 128
%2805 = torch.aten.div.Scalar %2804, %int128_2779 : !torch.vtensor<[128],f32>, !torch.int -> !torch.vtensor<[128],f32>
%float2.000000e00_2780 = torch.constant.float 2.000000e+00
%2806 = torch.aten.mul.Scalar %2805, %float2.000000e00_2780 : !torch.vtensor<[128],f32>, !torch.float -> !torch.vtensor<[128],f32>
%float5.000000e05_2781 = torch.constant.float 5.000000e+05
%2807 = torch.aten.pow.Scalar %float5.000000e05_2781, %2806 : !torch.float, !torch.vtensor<[128],f32> -> !torch.vtensor<[128],f32>
%2808 = torch.aten.reciprocal %2807 : !torch.vtensor<[128],f32> -> !torch.vtensor<[128],f32>
%float1.000000e00_2782 = torch.constant.float 1.000000e+00
%2809 = torch.aten.mul.Scalar %2808, %float1.000000e00_2782 : !torch.vtensor<[128],f32>, !torch.float -> !torch.vtensor<[128],f32>
%int131072_2783 = torch.constant.int 131072
%int1_2784 = torch.constant.int 1
%2810 = torch.prim.ListConstruct %int131072_2783, %int1_2784 : (!torch.int, !torch.int) -> !torch.list<int>
%2811 = torch.aten.view %2801, %2810 : !torch.vtensor<[131072],si64>, !torch.list<int> -> !torch.vtensor<[131072,1],si64>
%2812 = torch.aten.mul.Tensor %2811, %2809 : !torch.vtensor<[131072,1],si64>, !torch.vtensor<[128],f32> -> !torch.vtensor<[131072,128],f32>
%int1_2785 = torch.constant.int 1
%2813 = torch.aten.size.int %2773, %int1_2785 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.int -> !torch.int
%int0_2786 = torch.constant.int 0
%2814 = torch.aten.add.int %int0_2786, %2813 : !torch.int, !torch.int -> !torch.int
%int0_2787 = torch.constant.int 0
%int0_2788 = torch.constant.int 0
%int1_2789 = torch.constant.int 1
%2815 = torch.aten.slice.Tensor %2812, %int0_2787, %int0_2788, %2814, %int1_2789 : !torch.vtensor<[131072,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],f32>
torch.bind_symbolic_shape %2815, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],f32>
%int1_2790 = torch.constant.int 1
%int0_2791 = torch.constant.int 0
%int9223372036854775807_2792 = torch.constant.int 9223372036854775807
%int1_2793 = torch.constant.int 1
%2816 = torch.aten.slice.Tensor %2815, %int1_2790, %int0_2791, %int9223372036854775807_2792, %int1_2793 : !torch.vtensor<[?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],f32>
torch.bind_symbolic_shape %2816, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],f32>
%int1_2794 = torch.constant.int 1
%int0_2795 = torch.constant.int 0
%int9223372036854775807_2796 = torch.constant.int 9223372036854775807
%int1_2797 = torch.constant.int 1
%2817 = torch.aten.slice.Tensor %2816, %int1_2794, %int0_2795, %int9223372036854775807_2796, %int1_2797 : !torch.vtensor<[?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],f32>
torch.bind_symbolic_shape %2817, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],f32>
%int0_2798 = torch.constant.int 0
%2818 = torch.aten.unsqueeze %2817, %int0_2798 : !torch.vtensor<[?,128],f32>, !torch.int -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %2818, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int1_2799 = torch.constant.int 1
%int0_2800 = torch.constant.int 0
%int9223372036854775807_2801 = torch.constant.int 9223372036854775807
%int1_2802 = torch.constant.int 1
%2819 = torch.aten.slice.Tensor %2818, %int1_2799, %int0_2800, %int9223372036854775807_2801, %int1_2802 : !torch.vtensor<[1,?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %2819, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int2_2803 = torch.constant.int 2
%int0_2804 = torch.constant.int 0
%int9223372036854775807_2805 = torch.constant.int 9223372036854775807
%int1_2806 = torch.constant.int 1
%2820 = torch.aten.slice.Tensor %2819, %int2_2803, %int0_2804, %int9223372036854775807_2805, %int1_2806 : !torch.vtensor<[1,?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %2820, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int1_2807 = torch.constant.int 1
%int1_2808 = torch.constant.int 1
%int1_2809 = torch.constant.int 1
%2821 = torch.prim.ListConstruct %int1_2807, %int1_2808, %int1_2809 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2822 = torch.aten.repeat %2820, %2821 : !torch.vtensor<[1,?,128],f32>, !torch.list<int> -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %2822, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
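// Apply rotary embedding to the 32-head query states: upcast to f32, call the sharktank rotary kernel, then cast back to bf16.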
%int6_2810 = torch.constant.int 6
%2823 = torch.prims.convert_element_type %2796, %int6_2810 : !torch.vtensor<[1,?,32,128],bf16>, !torch.int -> !torch.vtensor<[1,?,32,128],f32>
torch.bind_symbolic_shape %2823, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32>
%2824 = torch_c.to_builtin_tensor %2823 : !torch.vtensor<[1,?,32,128],f32> -> tensor<1x?x32x128xf32>
%2825 = torch_c.to_builtin_tensor %2822 : !torch.vtensor<[1,?,128],f32> -> tensor<1x?x128xf32>
%2826 = util.call @sharktank_rotary_embedding_1_D_32_128_f32(%2824, %2825) : (tensor<1x?x32x128xf32>, tensor<1x?x128xf32>) -> tensor<1x?x32x128xf32>
%2827 = torch_c.from_builtin_tensor %2826 : tensor<1x?x32x128xf32> -> !torch.vtensor<[1,?,32,128],f32>
torch.bind_symbolic_shape %2827, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32>
%int15_2811 = torch.constant.int 15
%2828 = torch.prims.convert_element_type %2827, %int15_2811 : !torch.vtensor<[1,?,32,128],f32>, !torch.int -> !torch.vtensor<[1,?,32,128],bf16>
torch.bind_symbolic_shape %2828, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16>
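// Key path: rebuild the same RoPE angle table and slice it to the current sequence length.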
%int131072_2812 = torch.constant.int 131072
%none_2813 = torch.constant.none
%none_2814 = torch.constant.none
%cpu_2815 = torch.constant.device "cpu"
%false_2816 = torch.constant.bool false
%2829 = torch.aten.arange %int131072_2812, %none_2813, %none_2814, %cpu_2815, %false_2816 : !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[131072],si64>
%int0_2817 = torch.constant.int 0
%int128_2818 = torch.constant.int 128
%none_2819 = torch.constant.none
%none_2820 = torch.constant.none
%cpu_2821 = torch.constant.device "cpu"
%false_2822 = torch.constant.bool false
%2830 = torch.aten.arange.start %int0_2817, %int128_2818, %none_2819, %none_2820, %cpu_2821, %false_2822 : !torch.int, !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[128],si64>
%int2_2823 = torch.constant.int 2
%2831 = torch.aten.floor_divide.Scalar %2830, %int2_2823 : !torch.vtensor<[128],si64>, !torch.int -> !torch.vtensor<[128],si64>
%int6_2824 = torch.constant.int 6
%2832 = torch.prims.convert_element_type %2831, %int6_2824 : !torch.vtensor<[128],si64>, !torch.int -> !torch.vtensor<[128],f32>
%int128_2825 = torch.constant.int 128
%2833 = torch.aten.div.Scalar %2832, %int128_2825 : !torch.vtensor<[128],f32>, !torch.int -> !torch.vtensor<[128],f32>
%float2.000000e00_2826 = torch.constant.float 2.000000e+00
%2834 = torch.aten.mul.Scalar %2833, %float2.000000e00_2826 : !torch.vtensor<[128],f32>, !torch.float -> !torch.vtensor<[128],f32>
%float5.000000e05_2827 = torch.constant.float 5.000000e+05
%2835 = torch.aten.pow.Scalar %float5.000000e05_2827, %2834 : !torch.float, !torch.vtensor<[128],f32> -> !torch.vtensor<[128],f32>
%2836 = torch.aten.reciprocal %2835 : !torch.vtensor<[128],f32> -> !torch.vtensor<[128],f32>
%float1.000000e00_2828 = torch.constant.float 1.000000e+00
%2837 = torch.aten.mul.Scalar %2836, %float1.000000e00_2828 : !torch.vtensor<[128],f32>, !torch.float -> !torch.vtensor<[128],f32>
%int131072_2829 = torch.constant.int 131072
%int1_2830 = torch.constant.int 1
%2838 = torch.prim.ListConstruct %int131072_2829, %int1_2830 : (!torch.int, !torch.int) -> !torch.list<int>
%2839 = torch.aten.view %2829, %2838 : !torch.vtensor<[131072],si64>, !torch.list<int> -> !torch.vtensor<[131072,1],si64>
%2840 = torch.aten.mul.Tensor %2839, %2837 : !torch.vtensor<[131072,1],si64>, !torch.vtensor<[128],f32> -> !torch.vtensor<[131072,128],f32>
%int1_2831 = torch.constant.int 1
%2841 = torch.aten.size.int %2783, %int1_2831 : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>, !torch.int -> !torch.int
%int0_2832 = torch.constant.int 0
%2842 = torch.aten.add.int %int0_2832, %2841 : !torch.int, !torch.int -> !torch.int
%int0_2833 = torch.constant.int 0
%int0_2834 = torch.constant.int 0
%int1_2835 = torch.constant.int 1
%2843 = torch.aten.slice.Tensor %2840, %int0_2833, %int0_2834, %2842, %int1_2835 : !torch.vtensor<[131072,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],f32>
torch.bind_symbolic_shape %2843, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],f32>
%int1_2836 = torch.constant.int 1
%int0_2837 = torch.constant.int 0
%int9223372036854775807_2838 = torch.constant.int 9223372036854775807
%int1_2839 = torch.constant.int 1
%2844 = torch.aten.slice.Tensor %2843, %int1_2836, %int0_2837, %int9223372036854775807_2838, %int1_2839 : !torch.vtensor<[?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],f32>
torch.bind_symbolic_shape %2844, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],f32>
%int1_2840 = torch.constant.int 1
%int0_2841 = torch.constant.int 0
%int9223372036854775807_2842 = torch.constant.int 9223372036854775807
%int1_2843 = torch.constant.int 1
%2845 = torch.aten.slice.Tensor %2844, %int1_2840, %int0_2841, %int9223372036854775807_2842, %int1_2843 : !torch.vtensor<[?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],f32>
torch.bind_symbolic_shape %2845, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],f32>
%int0_2844 = torch.constant.int 0
%2846 = torch.aten.unsqueeze %2845, %int0_2844 : !torch.vtensor<[?,128],f32>, !torch.int -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %2846, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int1_2845 = torch.constant.int 1
%int0_2846 = torch.constant.int 0
%int9223372036854775807_2847 = torch.constant.int 9223372036854775807
%int1_2848 = torch.constant.int 1
%2847 = torch.aten.slice.Tensor %2846, %int1_2845, %int0_2846, %int9223372036854775807_2847, %int1_2848 : !torch.vtensor<[1,?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %2847, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int2_2849 = torch.constant.int 2
%int0_2850 = torch.constant.int 0
%int9223372036854775807_2851 = torch.constant.int 9223372036854775807
%int1_2852 = torch.constant.int 1
%2848 = torch.aten.slice.Tensor %2847, %int2_2849, %int0_2850, %int9223372036854775807_2851, %int1_2852 : !torch.vtensor<[1,?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %2848, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int1_2853 = torch.constant.int 1
%int1_2854 = torch.constant.int 1
%int1_2855 = torch.constant.int 1
%2849 = torch.prim.ListConstruct %int1_2853, %int1_2854, %int1_2855 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2850 = torch.aten.repeat %2848, %2849 : !torch.vtensor<[1,?,128],f32>, !torch.list<int> -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %2850, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
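// Apply rotary embedding to the 8-head key states.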
%int6_2856 = torch.constant.int 6
%2851 = torch.prims.convert_element_type %2798, %int6_2856 : !torch.vtensor<[1,?,8,128],bf16>, !torch.int -> !torch.vtensor<[1,?,8,128],f32>
torch.bind_symbolic_shape %2851, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],f32>
%2852 = torch_c.to_builtin_tensor %2851 : !torch.vtensor<[1,?,8,128],f32> -> tensor<1x?x8x128xf32>
%2853 = torch_c.to_builtin_tensor %2850 : !torch.vtensor<[1,?,128],f32> -> tensor<1x?x128xf32>
%2854 = util.call @sharktank_rotary_embedding_1_D_8_128_f32(%2852, %2853) : (tensor<1x?x8x128xf32>, tensor<1x?x128xf32>) -> tensor<1x?x8x128xf32>
%2855 = torch_c.from_builtin_tensor %2854 : tensor<1x?x8x128xf32> -> !torch.vtensor<[1,?,8,128],f32>
torch.bind_symbolic_shape %2855, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],f32>
%int15_2857 = torch.constant.int 15
%2856 = torch.prims.convert_element_type %2855, %int15_2857 : !torch.vtensor<[1,?,8,128],f32>, !torch.int -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %2856, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
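// Quantize the rotated keys and the value states for the KV cache: divide by the cache quantizer scale and clamp to [-240, 240] (the f8E4M3FNUZ finite range) before converting.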
%2857 = torch.aten.div.Tensor %2856, %178 : !torch.vtensor<[1,?,8,128],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %2857, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
%float-2.400000e02_2858 = torch.constant.float -2.400000e+02
%float2.400000e02_2859 = torch.constant.float 2.400000e+02
%2858 = torch.aten.clamp %2857, %float-2.400000e02_2858, %float2.400000e02_2859 : !torch.vtensor<[1,?,8,128],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %2858, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
%int26_2860 = torch.constant.int 26
%2859 = torch.prims.convert_element_type %2858, %int26_2860 : !torch.vtensor<[1,?,8,128],bf16>, !torch.int -> !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %2859, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>
%2860 = torch.aten.div.Tensor %2800, %178 : !torch.vtensor<[1,?,8,128],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %2860, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
%float-2.400000e02_2861 = torch.constant.float -2.400000e+02
%float2.400000e02_2862 = torch.constant.float 2.400000e+02
%2861 = torch.aten.clamp %2860, %float-2.400000e02_2861, %float2.400000e02_2862 : !torch.vtensor<[1,?,8,128],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %2861, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
%int26_2863 = torch.constant.int 26
%2862 = torch.prims.convert_element_type %2861, %int26_2863 : !torch.vtensor<[1,?,8,128],bf16>, !torch.int -> !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %2862, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>
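// Compute cache slot indices as %arg2 * 64 + 20 and scatter the quantized keys into the paged cache ([?, 2097152] viewed as [?, 32, 2, 32, 8, 128]) with index_put.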
%int64_2864 = torch.constant.int 64
%2863 = torch.aten.mul.Scalar %arg2, %int64_2864 : !torch.vtensor<[1,?],si64>, !torch.int -> !torch.vtensor<[1,?],si64>
torch.bind_symbolic_shape %2863, [%548], affine_map<()[s0] -> (1, s0)> : !torch.vtensor<[1,?],si64>
%int20 = torch.constant.int 20
%int1_2865 = torch.constant.int 1
%2864 = torch.aten.add.Scalar %2863, %int20, %int1_2865 : !torch.vtensor<[1,?],si64>, !torch.int, !torch.int -> !torch.vtensor<[1,?],si64>
torch.bind_symbolic_shape %2864, [%548], affine_map<()[s0] -> (1, s0)> : !torch.vtensor<[1,?],si64>
%int1_2866 = torch.constant.int 1
%int32_2867 = torch.constant.int 32
%int8_2868 = torch.constant.int 8
%int128_2869 = torch.constant.int 128
%2865 = torch.prim.ListConstruct %int1_2866, %670, %int32_2867, %int8_2868, %int128_2869 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2866 = torch.aten.view %2859, %2865 : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %2866, [%548], affine_map<()[s0] -> (1, s0, 32, 8, 128)> : !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>
%int32_2870 = torch.constant.int 32
%int8_2871 = torch.constant.int 8
%int128_2872 = torch.constant.int 128
%2867 = torch.prim.ListConstruct %670, %int32_2870, %int8_2871, %int128_2872 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2868 = torch.aten.view %2866, %2867 : !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %2868, [%548], affine_map<()[s0] -> (s0, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
%2869 = torch.prim.ListConstruct %670 : (!torch.int) -> !torch.list<int>
%2870 = torch.aten.view %2864, %2869 : !torch.vtensor<[1,?],si64>, !torch.list<int> -> !torch.vtensor<[?],si64>
torch.bind_symbolic_shape %2870, [%548], affine_map<()[s0] -> (s0)> : !torch.vtensor<[?],si64>
%int32_2873 = torch.constant.int 32
%int2_2874 = torch.constant.int 2
%int32_2875 = torch.constant.int 32
%int8_2876 = torch.constant.int 8
%int128_2877 = torch.constant.int 128
%2871 = torch.prim.ListConstruct %661, %int32_2873, %int2_2874, %int32_2875, %int8_2876, %int128_2877 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2872 = torch.aten.view %2679, %2871 : !torch.vtensor<[?,2097152],f16>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f16>
torch.bind_symbolic_shape %2872, [%549], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f16>
%int32_2878 = torch.constant.int 32
%2873 = torch.aten.mul.int %661, %int32_2878 : !torch.int, !torch.int -> !torch.int
%int2_2879 = torch.constant.int 2
%2874 = torch.aten.mul.int %2873, %int2_2879 : !torch.int, !torch.int -> !torch.int
%int32_2880 = torch.constant.int 32
%int8_2881 = torch.constant.int 8
%int128_2882 = torch.constant.int 128
%2875 = torch.prim.ListConstruct %2874, %int32_2880, %int8_2881, %int128_2882 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2876 = torch.aten.view %2872, %2875 : !torch.vtensor<[?,32,2,32,8,128],f16>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f16>
torch.bind_symbolic_shape %2876, [%549], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f16>
%2877 = torch.prim.ListConstruct %2870 : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>>
%false_2883 = torch.constant.bool false
%2878 = torch.aten.index_put %2876, %2877, %2868, %false_2883 : !torch.vtensor<[?,32,8,128],f16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.bool -> !torch.vtensor<[?,32,8,128],f16>
torch.bind_symbolic_shape %2878, [%549], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f16>
%int32_2884 = torch.constant.int 32
%int2_2885 = torch.constant.int 2
%int32_2886 = torch.constant.int 32
%int8_2887 = torch.constant.int 8
%int128_2888 = torch.constant.int 128
%2879 = torch.prim.ListConstruct %661, %int32_2884, %int2_2885, %int32_2886, %int8_2887, %int128_2888 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2880 = torch.aten.view %2878, %2879 : !torch.vtensor<[?,32,8,128],f16>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f16>
torch.bind_symbolic_shape %2880, [%549], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f16>
%int2097152_2889 = torch.constant.int 2097152
%2881 = torch.prim.ListConstruct %661, %int2097152_2889 : (!torch.int, !torch.int) -> !torch.list<int>
%2882 = torch.aten.view %2880, %2881 : !torch.vtensor<[?,32,2,32,8,128],f16>, !torch.list<int> -> !torch.vtensor<[?,2097152],f16>
torch.bind_symbolic_shape %2882, [%549], affine_map<()[s0] -> (s0, 2097152)> : !torch.vtensor<[?,2097152],f16>
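// Re-view the updated cache and scatter the quantized values with a second index_put at slot index + 1, then flatten back to [?, 2097152].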
%int32_2890 = torch.constant.int 32
%int2_2891 = torch.constant.int 2
%int32_2892 = torch.constant.int 32
%int8_2893 = torch.constant.int 8
%int128_2894 = torch.constant.int 128
%2883 = torch.prim.ListConstruct %661, %int32_2890, %int2_2891, %int32_2892, %int8_2893, %int128_2894 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2884 = torch.aten.view %2882, %2883 : !torch.vtensor<[?,2097152],f16>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f16>
torch.bind_symbolic_shape %2884, [%549], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f16>
%int32_2895 = torch.constant.int 32
%int8_2896 = torch.constant.int 8
%int128_2897 = torch.constant.int 128
%2885 = torch.prim.ListConstruct %2874, %int32_2895, %int8_2896, %int128_2897 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2886 = torch.aten.view %2884, %2885 : !torch.vtensor<[?,32,2,32,8,128],f16>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f16>
torch.bind_symbolic_shape %2886, [%549], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f16>
%int1_2898 = torch.constant.int 1
%int32_2899 = torch.constant.int 32
%int8_2900 = torch.constant.int 8
%int128_2901 = torch.constant.int 128
%2887 = torch.prim.ListConstruct %int1_2898, %670, %int32_2899, %int8_2900, %int128_2901 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2888 = torch.aten.view %2862, %2887 : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %2888, [%548], affine_map<()[s0] -> (1, s0, 32, 8, 128)> : !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>
%int32_2902 = torch.constant.int 32
%int8_2903 = torch.constant.int 8
%int128_2904 = torch.constant.int 128
%2889 = torch.prim.ListConstruct %670, %int32_2902, %int8_2903, %int128_2904 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2890 = torch.aten.view %2888, %2889 : !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %2890, [%548], affine_map<()[s0] -> (s0, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
%int1_2905 = torch.constant.int 1
%int1_2906 = torch.constant.int 1
%2891 = torch.aten.add.Scalar %2864, %int1_2905, %int1_2906 : !torch.vtensor<[1,?],si64>, !torch.int, !torch.int -> !torch.vtensor<[1,?],si64>
torch.bind_symbolic_shape %2891, [%548], affine_map<()[s0] -> (1, s0)> : !torch.vtensor<[1,?],si64>
%2892 = torch.prim.ListConstruct %670 : (!torch.int) -> !torch.list<int>
%2893 = torch.aten.view %2891, %2892 : !torch.vtensor<[1,?],si64>, !torch.list<int> -> !torch.vtensor<[?],si64>
torch.bind_symbolic_shape %2893, [%548], affine_map<()[s0] -> (s0)> : !torch.vtensor<[?],si64>
%2894 = torch.prim.ListConstruct %2893 : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>>
%false_2907 = torch.constant.bool false
%2895 = torch.aten.index_put %2886, %2894, %2890, %false_2907 : !torch.vtensor<[?,32,8,128],f16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.bool -> !torch.vtensor<[?,32,8,128],f16>
torch.bind_symbolic_shape %2895, [%549], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f16>
%int32_2908 = torch.constant.int 32
%int2_2909 = torch.constant.int 2
%int32_2910 = torch.constant.int 32
%int8_2911 = torch.constant.int 8
%int128_2912 = torch.constant.int 128
%2896 = torch.prim.ListConstruct %661, %int32_2908, %int2_2909, %int32_2910, %int8_2911, %int128_2912 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2897 = torch.aten.view %2895, %2896 : !torch.vtensor<[?,32,8,128],f16>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f16>
torch.bind_symbolic_shape %2897, [%549], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f16>
%int2097152_2913 = torch.constant.int 2097152
%2898 = torch.prim.ListConstruct %661, %int2097152_2913 : (!torch.int, !torch.int) -> !torch.list<int>
%2899 = torch.aten.view %2897, %2898 : !torch.vtensor<[?,32,2,32,8,128],f16>, !torch.list<int> -> !torch.vtensor<[?,2097152],f16>
torch.bind_symbolic_shape %2899, [%549], affine_map<()[s0] -> (s0, 2097152)> : !torch.vtensor<[?,2097152],f16>
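// Grouped-query attention: expand the 8 KV heads to 32 by unsqueezing, broadcasting 4x, and flattening to [1, seq, 32, 128].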
%int-2_2914 = torch.constant.int -2
%2900 = torch.aten.unsqueeze %2859, %int-2_2914 : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %2900, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 1, 128)> : !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>
%int1_2915 = torch.constant.int 1
%int8_2916 = torch.constant.int 8
%int4_2917 = torch.constant.int 4
%int128_2918 = torch.constant.int 128
%2901 = torch.prim.ListConstruct %int1_2915, %2841, %int8_2916, %int4_2917, %int128_2918 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%false_2919 = torch.constant.bool false
%2902 = torch.aten.expand %2900, %2901, %false_2919 : !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %2902, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 4, 128)> : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
%int0_2920 = torch.constant.int 0
%2903 = torch.aten.clone %2902, %int0_2920 : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %2903, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 4, 128)> : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
%int1_2921 = torch.constant.int 1
%int32_2922 = torch.constant.int 32
%int128_2923 = torch.constant.int 128
%2904 = torch.prim.ListConstruct %int1_2921, %2841, %int32_2922, %int128_2923 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2905 = torch.aten._unsafe_view %2903, %2904 : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %2905, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>
%int-2_2924 = torch.constant.int -2
%2906 = torch.aten.unsqueeze %2862, %int-2_2924 : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %2906, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 1, 128)> : !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>
%int1_2925 = torch.constant.int 1
%2907 = torch.aten.size.int %2793, %int1_2925 : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>, !torch.int -> !torch.int
%int1_2926 = torch.constant.int 1
%int8_2927 = torch.constant.int 8
%int4_2928 = torch.constant.int 4
%int128_2929 = torch.constant.int 128
%2908 = torch.prim.ListConstruct %int1_2926, %2907, %int8_2927, %int4_2928, %int128_2929 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%false_2930 = torch.constant.bool false
%2909 = torch.aten.expand %2906, %2908, %false_2930 : !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %2909, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 4, 128)> : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
%int0_2931 = torch.constant.int 0
%2910 = torch.aten.clone %2909, %int0_2931 : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %2910, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 4, 128)> : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
%int1_2932 = torch.constant.int 1
%int32_2933 = torch.constant.int 32
%int128_2934 = torch.constant.int 128
%2911 = torch.prim.ListConstruct %int1_2932, %2907, %int32_2933, %int128_2934 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2912 = torch.aten._unsafe_view %2910, %2911 : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %2912, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>
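// Dequantize the expanded keys and values back to bf16 by multiplying with the KV-cache scale.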
%int6_2935 = torch.constant.int 6
%2913 = torch.prims.convert_element_type %2905, %int6_2935 : !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,32,128],f32>
torch.bind_symbolic_shape %2913, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32>
%2914 = torch.aten.mul.Tensor %2913, %178 : !torch.vtensor<[1,?,32,128],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,32,128],f32>
torch.bind_symbolic_shape %2914, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32>
%int15_2936 = torch.constant.int 15
%2915 = torch.prims.convert_element_type %2914, %int15_2936 : !torch.vtensor<[1,?,32,128],f32>, !torch.int -> !torch.vtensor<[1,?,32,128],bf16>
torch.bind_symbolic_shape %2915, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16>
%int6_2937 = torch.constant.int 6
%2916 = torch.prims.convert_element_type %2912, %int6_2937 : !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,32,128],f32>
torch.bind_symbolic_shape %2916, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32>
%2917 = torch.aten.mul.Tensor %2916, %178 : !torch.vtensor<[1,?,32,128],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,32,128],f32>
torch.bind_symbolic_shape %2917, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32>
%int15_2938 = torch.constant.int 15
%2918 = torch.prims.convert_element_type %2917, %int15_2938 : !torch.vtensor<[1,?,32,128],f32>, !torch.int -> !torch.vtensor<[1,?,32,128],bf16>
torch.bind_symbolic_shape %2918, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16>
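// Transpose Q, K, and V to [1, 32, seq, 128] and run causal scaled-dot-product flash attention (dropout 0.0, is_causal = true).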
%int1_2939 = torch.constant.int 1
%int2_2940 = torch.constant.int 2
%2919 = torch.aten.transpose.int %2828, %int1_2939, %int2_2940 : !torch.vtensor<[1,?,32,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,32,?,128],bf16>
torch.bind_symbolic_shape %2919, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16>
%int1_2941 = torch.constant.int 1
%int2_2942 = torch.constant.int 2
%2920 = torch.aten.transpose.int %2915, %int1_2941, %int2_2942 : !torch.vtensor<[1,?,32,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,32,?,128],bf16>
torch.bind_symbolic_shape %2920, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16>
%int1_2943 = torch.constant.int 1
%int2_2944 = torch.constant.int 2
%2921 = torch.aten.transpose.int %2918, %int1_2943, %int2_2944 : !torch.vtensor<[1,?,32,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,32,?,128],bf16>
torch.bind_symbolic_shape %2921, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16>
%float0.000000e00_2945 = torch.constant.float 0.000000e+00
%true_2946 = torch.constant.bool true
%none_2947 = torch.constant.none
%none_2948 = torch.constant.none
%2922:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%2919, %2920, %2921, %float0.000000e00_2945, %true_2946, %none_2947, %none_2948) : (!torch.vtensor<[1,32,?,128],bf16>, !torch.vtensor<[1,32,?,128],bf16>, !torch.vtensor<[1,32,?,128],bf16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[1,32,?,128],bf16>, !torch.vtensor<[1,32,?],f32>)
torch.bind_symbolic_shape %2922#0, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16>
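// Transpose the attention output back to [1, seq, 32, 128], flatten to [1, seq, 4096], quantize, apply the attention output projection (4096 -> 4096), and add the residual.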
%int1_2949 = torch.constant.int 1
%int2_2950 = torch.constant.int 2
%2923 = torch.aten.transpose.int %2922#0, %int1_2949, %int2_2950 : !torch.vtensor<[1,32,?,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,?,32,128],bf16>
torch.bind_symbolic_shape %2923, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16>
%int1_2951 = torch.constant.int 1
%int4096_2952 = torch.constant.int 4096
%2924 = torch.prim.ListConstruct %int1_2951, %2813, %int4096_2952 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2925 = torch.aten.view %2923, %2924 : !torch.vtensor<[1,?,32,128],bf16>, !torch.list<int> -> !torch.vtensor<[1,?,4096],bf16>
torch.bind_symbolic_shape %2925, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16>
%2926 = torch.aten.div.Tensor %2925, %179 : !torch.vtensor<[1,?,4096],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],bf16>
torch.bind_symbolic_shape %2926, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16>
%float-2.400000e02_2953 = torch.constant.float -2.400000e+02
%float2.400000e02_2954 = torch.constant.float 2.400000e+02
%2927 = torch.aten.clamp %2926, %float-2.400000e02_2953, %float2.400000e02_2954 : !torch.vtensor<[1,?,4096],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],bf16>
torch.bind_symbolic_shape %2927, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16>
%int26_2955 = torch.constant.int 26
%2928 = torch.prims.convert_element_type %2927, %int26_2955 : !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2928, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int-2_2956 = torch.constant.int -2
%int-1_2957 = torch.constant.int -1
%2929 = torch.aten.transpose.int %180, %int-2_2956, %int-1_2957 : !torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%int4096_2958 = torch.constant.int 4096
%2930 = torch.prim.ListConstruct %2813, %int4096_2958 : (!torch.int, !torch.int) -> !torch.list<int>
%2931 = torch.aten.view %2928, %2930 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2931, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%2932 = torch.aten.mm %2931, %2929 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,4096],f8E4M3FNUZ> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2932, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%int1_2959 = torch.constant.int 1
%int4096_2960 = torch.constant.int 4096
%2933 = torch.prim.ListConstruct %int1_2959, %2813, %int4096_2960 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2934 = torch.aten.view %2932, %2933 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2934, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int15_2961 = torch.constant.int 15
%2935 = torch.prims.convert_element_type %2934, %int15_2961 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,4096],bf16>
torch.bind_symbolic_shape %2935, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16>
%int1_2962 = torch.constant.int 1
%2936 = torch.aten.add.Tensor %2757, %2935, %int1_2962 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2936, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
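// RMSNorm over the hidden dimension (mean of squares, eps = 1e-5, rsqrt), scaled by the norm weight, then clamped and quantized to f8E4M3FNUZ for the FFN projections.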
%int2_2963 = torch.constant.int 2
%2937 = torch.aten.pow.Tensor_Scalar %2936, %int2_2963 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2937, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int-1_2964 = torch.constant.int -1
%2938 = torch.prim.ListConstruct %int-1_2964 : (!torch.int) -> !torch.list<int>
%true_2965 = torch.constant.bool true
%none_2966 = torch.constant.none
%2939 = torch.aten.mean.dim %2937, %2938, %true_2965, %none_2966 : !torch.vtensor<[1,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,?,1],f32>
torch.bind_symbolic_shape %2939, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32>
%float1.000000e-05_2967 = torch.constant.float 1.000000e-05
%int1_2968 = torch.constant.int 1
%2940 = torch.aten.add.Scalar %2939, %float1.000000e-05_2967, %int1_2968 : !torch.vtensor<[1,?,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,?,1],f32>
torch.bind_symbolic_shape %2940, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32>
%2941 = torch.aten.rsqrt %2940 : !torch.vtensor<[1,?,1],f32> -> !torch.vtensor<[1,?,1],f32>
torch.bind_symbolic_shape %2941, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32>
%2942 = torch.aten.mul.Tensor %2936, %2941 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[1,?,1],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2942, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%2943 = torch.aten.mul.Tensor %181, %2942 : !torch.vtensor<[4096],bf16>, !torch.vtensor<[1,?,4096],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2943, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%2944 = torch.aten.div.Tensor %2943, %182 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2944, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%float-2.400000e02_2969 = torch.constant.float -2.400000e+02
%float2.400000e02_2970 = torch.constant.float 2.400000e+02
%2945 = torch.aten.clamp %2944, %float-2.400000e02_2969, %float2.400000e02_2970 : !torch.vtensor<[1,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2945, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int26_2971 = torch.constant.int 26
%2946 = torch.prims.convert_element_type %2945, %int26_2971 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2946, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
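// Gated FFN: the first 4096 -> 14336 projection passes through silu and gates the second 4096 -> 14336 projection elementwise.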
%int-2_2972 = torch.constant.int -2
%int-1_2973 = torch.constant.int -1
%2947 = torch.aten.transpose.int %183, %int-2_2972, %int-1_2973 : !torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
%int4096_2974 = torch.constant.int 4096
%2948 = torch.prim.ListConstruct %566, %int4096_2974 : (!torch.int, !torch.int) -> !torch.list<int>
%2949 = torch.aten.view %2946, %2948 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2949, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%2950 = torch.aten.mm %2949, %2947 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,14336],f8E4M3FNUZ> -> !torch.vtensor<[?,14336],f8E4M3FNUZ>
torch.bind_symbolic_shape %2950, [%548], affine_map<()[s0] -> (s0 * 32, 14336)> : !torch.vtensor<[?,14336],f8E4M3FNUZ>
%int1_2975 = torch.constant.int 1
%int14336_2976 = torch.constant.int 14336
%2951 = torch.prim.ListConstruct %int1_2975, %566, %int14336_2976 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2952 = torch.aten.view %2950, %2951 : !torch.vtensor<[?,14336],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,14336],f8E4M3FNUZ>
torch.bind_symbolic_shape %2952, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>
%int15_2977 = torch.constant.int 15
%2953 = torch.prims.convert_element_type %2952, %int15_2977 : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,14336],bf16>
torch.bind_symbolic_shape %2953, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16>
%2954 = torch.aten.silu %2953 : !torch.vtensor<[1,?,14336],bf16> -> !torch.vtensor<[1,?,14336],bf16>
torch.bind_symbolic_shape %2954, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16>
%2955 = torch.aten.div.Tensor %2943, %184 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2955, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%float-2.400000e02_2978 = torch.constant.float -2.400000e+02
%float2.400000e02_2979 = torch.constant.float 2.400000e+02
%2956 = torch.aten.clamp %2955, %float-2.400000e02_2978, %float2.400000e02_2979 : !torch.vtensor<[1,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2956, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int26_2980 = torch.constant.int 26
%2957 = torch.prims.convert_element_type %2956, %int26_2980 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2957, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int-2_2981 = torch.constant.int -2
%int-1_2982 = torch.constant.int -1
%2958 = torch.aten.transpose.int %185, %int-2_2981, %int-1_2982 : !torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
%int4096_2983 = torch.constant.int 4096
%2959 = torch.prim.ListConstruct %566, %int4096_2983 : (!torch.int, !torch.int) -> !torch.list<int>
%2960 = torch.aten.view %2957, %2959 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2960, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%2961 = torch.aten.mm %2960, %2958 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,14336],f8E4M3FNUZ> -> !torch.vtensor<[?,14336],f8E4M3FNUZ>
torch.bind_symbolic_shape %2961, [%548], affine_map<()[s0] -> (s0 * 32, 14336)> : !torch.vtensor<[?,14336],f8E4M3FNUZ>
%int1_2984 = torch.constant.int 1
%int14336_2985 = torch.constant.int 14336
%2962 = torch.prim.ListConstruct %int1_2984, %566, %int14336_2985 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2963 = torch.aten.view %2961, %2962 : !torch.vtensor<[?,14336],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,14336],f8E4M3FNUZ>
torch.bind_symbolic_shape %2963, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>
%int15_2986 = torch.constant.int 15
%2964 = torch.prims.convert_element_type %2963, %int15_2986 : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,14336],bf16>
torch.bind_symbolic_shape %2964, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16>
%2965 = torch.aten.mul.Tensor %2954, %2964 : !torch.vtensor<[1,?,14336],bf16>, !torch.vtensor<[1,?,14336],bf16> -> !torch.vtensor<[1,?,14336],bf16>
torch.bind_symbolic_shape %2965, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16>
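// Quantize the gated activations and apply the 14336 -> 4096 down projection, then add the residual.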
%2966 = torch.aten.div.Tensor %2965, %186 : !torch.vtensor<[1,?,14336],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,14336],bf16>
torch.bind_symbolic_shape %2966, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16>
%float-2.400000e02_2987 = torch.constant.float -2.400000e+02
%float2.400000e02_2988 = torch.constant.float 2.400000e+02
%2967 = torch.aten.clamp %2966, %float-2.400000e02_2987, %float2.400000e02_2988 : !torch.vtensor<[1,?,14336],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,14336],bf16>
torch.bind_symbolic_shape %2967, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16>
%int26_2989 = torch.constant.int 26
%2968 = torch.prims.convert_element_type %2967, %int26_2989 : !torch.vtensor<[1,?,14336],bf16>, !torch.int -> !torch.vtensor<[1,?,14336],f8E4M3FNUZ>
torch.bind_symbolic_shape %2968, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>
%int-2_2990 = torch.constant.int -2
%int-1_2991 = torch.constant.int -1
%2969 = torch.aten.transpose.int %187, %int-2_2990, %int-1_2991 : !torch.vtensor<[4096,14336],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%int1_2992 = torch.constant.int 1
%2970 = torch.aten.size.int %2952, %int1_2992 : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>, !torch.int -> !torch.int
%int14336_2993 = torch.constant.int 14336
%2971 = torch.prim.ListConstruct %2970, %int14336_2993 : (!torch.int, !torch.int) -> !torch.list<int>
%2972 = torch.aten.view %2968, %2971 : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,14336],f8E4M3FNUZ>
torch.bind_symbolic_shape %2972, [%548], affine_map<()[s0] -> (s0 * 32, 14336)> : !torch.vtensor<[?,14336],f8E4M3FNUZ>
%2973 = torch.aten.mm %2972, %2969 : !torch.vtensor<[?,14336],f8E4M3FNUZ>, !torch.vtensor<[14336,4096],f8E4M3FNUZ> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2973, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%int1_2994 = torch.constant.int 1
%int4096_2995 = torch.constant.int 4096
%2974 = torch.prim.ListConstruct %int1_2994, %2970, %int4096_2995 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2975 = torch.aten.view %2973, %2974 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2975, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int15_2996 = torch.constant.int 15
%2976 = torch.prims.convert_element_type %2975, %int15_2996 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,4096],bf16>
torch.bind_symbolic_shape %2976, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16>
%int1_2997 = torch.constant.int 1
%2977 = torch.aten.add.Tensor %2936, %2976, %int1_2997 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2977, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
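// The pattern repeats for the next attention block: RMSNorm, quantize, then Q (4096 -> 4096), K (4096 -> 1024), and V (4096 -> 1024) projections reshaped into 32 query heads and 8 KV heads of size 128.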
%int2_2998 = torch.constant.int 2
%2978 = torch.aten.pow.Tensor_Scalar %2977, %int2_2998 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2978, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int-1_2999 = torch.constant.int -1
%2979 = torch.prim.ListConstruct %int-1_2999 : (!torch.int) -> !torch.list<int>
%true_3000 = torch.constant.bool true
%none_3001 = torch.constant.none
%2980 = torch.aten.mean.dim %2978, %2979, %true_3000, %none_3001 : !torch.vtensor<[1,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,?,1],f32>
torch.bind_symbolic_shape %2980, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32>
%float1.000000e-05_3002 = torch.constant.float 1.000000e-05
%int1_3003 = torch.constant.int 1
%2981 = torch.aten.add.Scalar %2980, %float1.000000e-05_3002, %int1_3003 : !torch.vtensor<[1,?,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,?,1],f32>
torch.bind_symbolic_shape %2981, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32>
%2982 = torch.aten.rsqrt %2981 : !torch.vtensor<[1,?,1],f32> -> !torch.vtensor<[1,?,1],f32>
torch.bind_symbolic_shape %2982, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32>
%2983 = torch.aten.mul.Tensor %2977, %2982 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[1,?,1],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2983, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%2984 = torch.aten.mul.Tensor %188, %2983 : !torch.vtensor<[4096],bf16>, !torch.vtensor<[1,?,4096],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2984, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%2985 = torch.aten.div.Tensor %2984, %189 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2985, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%float-2.400000e02_3004 = torch.constant.float -2.400000e+02
%float2.400000e02_3005 = torch.constant.float 2.400000e+02
%2986 = torch.aten.clamp %2985, %float-2.400000e02_3004, %float2.400000e02_3005 : !torch.vtensor<[1,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2986, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int26_3006 = torch.constant.int 26
%2987 = torch.prims.convert_element_type %2986, %int26_3006 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2987, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int-2_3007 = torch.constant.int -2
%int-1_3008 = torch.constant.int -1
%2988 = torch.aten.transpose.int %190, %int-2_3007, %int-1_3008 : !torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%int4096_3009 = torch.constant.int 4096
%2989 = torch.prim.ListConstruct %566, %int4096_3009 : (!torch.int, !torch.int) -> !torch.list<int>
%2990 = torch.aten.view %2987, %2989 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2990, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%2991 = torch.aten.mm %2990, %2988 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,4096],f8E4M3FNUZ> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2991, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%int1_3010 = torch.constant.int 1
%int4096_3011 = torch.constant.int 4096
%2992 = torch.prim.ListConstruct %int1_3010, %566, %int4096_3011 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2993 = torch.aten.view %2991, %2992 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2993, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int15_3012 = torch.constant.int 15
%2994 = torch.prims.convert_element_type %2993, %int15_3012 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,4096],bf16>
torch.bind_symbolic_shape %2994, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16>
%2995 = torch.aten.div.Tensor %2984, %191 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2995, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%float-2.400000e02_3013 = torch.constant.float -2.400000e+02
%float2.400000e02_3014 = torch.constant.float 2.400000e+02
%2996 = torch.aten.clamp %2995, %float-2.400000e02_3013, %float2.400000e02_3014 : !torch.vtensor<[1,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2996, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int26_3015 = torch.constant.int 26
%2997 = torch.prims.convert_element_type %2996, %int26_3015 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2997, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int-2_3016 = torch.constant.int -2
%int-1_3017 = torch.constant.int -1
%2998 = torch.aten.transpose.int %192, %int-2_3016, %int-1_3017 : !torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,1024],f8E4M3FNUZ>
%int4096_3018 = torch.constant.int 4096
%2999 = torch.prim.ListConstruct %566, %int4096_3018 : (!torch.int, !torch.int) -> !torch.list<int>
%3000 = torch.aten.view %2997, %2999 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %3000, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%3001 = torch.aten.mm %3000, %2998 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,1024],f8E4M3FNUZ> -> !torch.vtensor<[?,1024],f8E4M3FNUZ>
torch.bind_symbolic_shape %3001, [%548], affine_map<()[s0] -> (s0 * 32, 1024)> : !torch.vtensor<[?,1024],f8E4M3FNUZ>
%int1_3019 = torch.constant.int 1
%int1024_3020 = torch.constant.int 1024
%3002 = torch.prim.ListConstruct %int1_3019, %566, %int1024_3020 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%3003 = torch.aten.view %3001, %3002 : !torch.vtensor<[?,1024],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,1024],f8E4M3FNUZ>
torch.bind_symbolic_shape %3003, [%548], affine_map<()[s0] -> (1, s0 * 32, 1024)> : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>
%int15_3021 = torch.constant.int 15
%3004 = torch.prims.convert_element_type %3003, %int15_3021 : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,1024],bf16>
torch.bind_symbolic_shape %3004, [%548], affine_map<()[s0] -> (1, s0 * 32, 1024)> : !torch.vtensor<[1,?,1024],bf16>
%3005 = torch.aten.div.Tensor %2984, %193 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %3005, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%float-2.400000e02_3022 = torch.constant.float -2.400000e+02
%float2.400000e02_3023 = torch.constant.float 2.400000e+02
%3006 = torch.aten.clamp %3005, %float-2.400000e02_3022, %float2.400000e02_3023 : !torch.vtensor<[1,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %3006, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int26_3024 = torch.constant.int 26
%3007 = torch.prims.convert_element_type %3006, %int26_3024 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %3007, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int-2_3025 = torch.constant.int -2
%int-1_3026 = torch.constant.int -1
%3008 = torch.aten.transpose.int %194, %int-2_3025, %int-1_3026 : !torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,1024],f8E4M3FNUZ>
%int4096_3027 = torch.constant.int 4096
%3009 = torch.prim.ListConstruct %566, %int4096_3027 : (!torch.int, !torch.int) -> !torch.list<int>
%3010 = torch.aten.view %3007, %3009 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %3010, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%3011 = torch.aten.mm %3010, %3008 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,1024],f8E4M3FNUZ> -> !torch.vtensor<[?,1024],f8E4M3FNUZ>
torch.bind_symbolic_shape %3011, [%548], affine_map<()[s0] -> (s0 * 32, 1024)> : !torch.vtensor<[?,1024],f8E4M3FNUZ>
%int1_3028 = torch.constant.int 1
%int1024_3029 = torch.constant.int 1024
%3012 = torch.prim.ListConstruct %int1_3028, %566, %int1024_3029 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%3013 = torch.aten.view %3011, %3012 : !torch.vtensor<[?,1024],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,1024],f8E4M3FNUZ>
torch.bind_symbolic_shape %3013, [%548], affine_map<()[s0] -> (1, s0 * 32, 1024)> : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>
%int15_3030 = torch.constant.int 15
%3014 = torch.prims.convert_element_type %3013, %int15_3030 : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,1024],bf16>
torch.bind_symbolic_shape %3014, [%548], affine_map<()[s0] -> (1, s0 * 32, 1024)> : !torch.vtensor<[1,?,1024],bf16>
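// Reshape into per-head layout: the 4096-wide activation %2994 becomes [1, seq, 32, 128] and the two 1024-wide outputs %3004 / %3014 become [1, seq, 8, 128], consistent with 32 query heads and 8 KV heads.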
%int1_3031 = torch.constant.int 1
%int32_3032 = torch.constant.int 32
%int128_3033 = torch.constant.int 128
%3015 = torch.prim.ListConstruct %int1_3031, %566, %int32_3032, %int128_3033 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%3016 = torch.aten.view %2994, %3015 : !torch.vtensor<[1,?,4096],bf16>, !torch.list<int> -> !torch.vtensor<[1,?,32,128],bf16>
torch.bind_symbolic_shape %3016, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16>
%int1_3034 = torch.constant.int 1
%int8_3035 = torch.constant.int 8
%int128_3036 = torch.constant.int 128
%3017 = torch.prim.ListConstruct %int1_3034, %566, %int8_3035, %int128_3036 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%3018 = torch.aten.view %3004, %3017 : !torch.vtensor<[1,?,1024],bf16>, !torch.list<int> -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %3018, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
%int1_3037 = torch.constant.int 1
%int8_3038 = torch.constant.int 8
%int128_3039 = torch.constant.int 128
%3019 = torch.prim.ListConstruct %int1_3037, %566, %int8_3038, %int128_3039 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%3020 = torch.aten.view %3014, %3019 : !torch.vtensor<[1,?,1024],bf16>, !torch.list<int> -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %3020, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
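// Rotary embedding for the 32-head tensor: build the inverse-frequency table (head_dim 128, base 5.0e5) over 131072 positions, slice it to the current sequence length, and apply it via the sharktank_rotary_embedding_1_D_32_128_f32 kernel, converting back to bf16 afterwards.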
%int131072_3040 = torch.constant.int 131072
%none_3041 = torch.constant.none
%none_3042 = torch.constant.none
%cpu_3043 = torch.constant.device "cpu"
%false_3044 = torch.constant.bool false
%3021 = torch.aten.arange %int131072_3040, %none_3041, %none_3042, %cpu_3043, %false_3044 : !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[131072],si64>
%int0_3045 = torch.constant.int 0
%int128_3046 = torch.constant.int 128
%none_3047 = torch.constant.none
%none_3048 = torch.constant.none
%cpu_3049 = torch.constant.device "cpu"
%false_3050 = torch.constant.bool false
%3022 = torch.aten.arange.start %int0_3045, %int128_3046, %none_3047, %none_3048, %cpu_3049, %false_3050 : !torch.int, !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[128],si64>
%int2_3051 = torch.constant.int 2
%3023 = torch.aten.floor_divide.Scalar %3022, %int2_3051 : !torch.vtensor<[128],si64>, !torch.int -> !torch.vtensor<[128],si64>
%int6_3052 = torch.constant.int 6
%3024 = torch.prims.convert_element_type %3023, %int6_3052 : !torch.vtensor<[128],si64>, !torch.int -> !torch.vtensor<[128],f32>
%int128_3053 = torch.constant.int 128
%3025 = torch.aten.div.Scalar %3024, %int128_3053 : !torch.vtensor<[128],f32>, !torch.int -> !torch.vtensor<[128],f32>
%float2.000000e00_3054 = torch.constant.float 2.000000e+00
%3026 = torch.aten.mul.Scalar %3025, %float2.000000e00_3054 : !torch.vtensor<[128],f32>, !torch.float -> !torch.vtensor<[128],f32>
%float5.000000e05_3055 = torch.constant.float 5.000000e+05
%3027 = torch.aten.pow.Scalar %float5.000000e05_3055, %3026 : !torch.float, !torch.vtensor<[128],f32> -> !torch.vtensor<[128],f32>
%3028 = torch.aten.reciprocal %3027 : !torch.vtensor<[128],f32> -> !torch.vtensor<[128],f32>
%float1.000000e00_3056 = torch.constant.float 1.000000e+00
%3029 = torch.aten.mul.Scalar %3028, %float1.000000e00_3056 : !torch.vtensor<[128],f32>, !torch.float -> !torch.vtensor<[128],f32>
%int131072_3057 = torch.constant.int 131072
%int1_3058 = torch.constant.int 1
%3030 = torch.prim.ListConstruct %int131072_3057, %int1_3058 : (!torch.int, !torch.int) -> !torch.list<int>
%3031 = torch.aten.view %3021, %3030 : !torch.vtensor<[131072],si64>, !torch.list<int> -> !torch.vtensor<[131072,1],si64>
%3032 = torch.aten.mul.Tensor %3031, %3029 : !torch.vtensor<[131072,1],si64>, !torch.vtensor<[128],f32> -> !torch.vtensor<[131072,128],f32>
%int1_3059 = torch.constant.int 1
%3033 = torch.aten.size.int %2993, %int1_3059 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.int -> !torch.int
%int0_3060 = torch.constant.int 0
%3034 = torch.aten.add.int %int0_3060, %3033 : !torch.int, !torch.int -> !torch.int
%int0_3061 = torch.constant.int 0
%int0_3062 = torch.constant.int 0
%int1_3063 = torch.constant.int 1
%3035 = torch.aten.slice.Tensor %3032, %int0_3061, %int0_3062, %3034, %int1_3063 : !torch.vtensor<[131072,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],f32>
torch.bind_symbolic_shape %3035, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],f32>
%int1_3064 = torch.constant.int 1
%int0_3065 = torch.constant.int 0
%int9223372036854775807_3066 = torch.constant.int 9223372036854775807
%int1_3067 = torch.constant.int 1
%3036 = torch.aten.slice.Tensor %3035, %int1_3064, %int0_3065, %int9223372036854775807_3066, %int1_3067 : !torch.vtensor<[?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],f32>
torch.bind_symbolic_shape %3036, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],f32>
%int1_3068 = torch.constant.int 1
%int0_3069 = torch.constant.int 0
%int9223372036854775807_3070 = torch.constant.int 9223372036854775807
%int1_3071 = torch.constant.int 1
%3037 = torch.aten.slice.Tensor %3036, %int1_3068, %int0_3069, %int9223372036854775807_3070, %int1_3071 : !torch.vtensor<[?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],f32>
torch.bind_symbolic_shape %3037, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],f32>
%int0_3072 = torch.constant.int 0
%3038 = torch.aten.unsqueeze %3037, %int0_3072 : !torch.vtensor<[?,128],f32>, !torch.int -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %3038, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int1_3073 = torch.constant.int 1
%int0_3074 = torch.constant.int 0
%int9223372036854775807_3075 = torch.constant.int 9223372036854775807
%int1_3076 = torch.constant.int 1
%3039 = torch.aten.slice.Tensor %3038, %int1_3073, %int0_3074, %int9223372036854775807_3075, %int1_3076 : !torch.vtensor<[1,?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %3039, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int2_3077 = torch.constant.int 2
%int0_3078 = torch.constant.int 0
%int9223372036854775807_3079 = torch.constant.int 9223372036854775807
%int1_3080 = torch.constant.int 1
%3040 = torch.aten.slice.Tensor %3039, %int2_3077, %int0_3078, %int9223372036854775807_3079, %int1_3080 : !torch.vtensor<[1,?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %3040, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int1_3081 = torch.constant.int 1
%int1_3082 = torch.constant.int 1
%int1_3083 = torch.constant.int 1
%3041 = torch.prim.ListConstruct %int1_3081, %int1_3082, %int1_3083 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%3042 = torch.aten.repeat %3040, %3041 : !torch.vtensor<[1,?,128],f32>, !torch.list<int> -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %3042, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int6_3084 = torch.constant.int 6
%3043 = torch.prims.convert_element_type %3016, %int6_3084 : !torch.vtensor<[1,?,32,128],bf16>, !torch.int -> !torch.vtensor<[1,?,32,128],f32>
torch.bind_symbolic_shape %3043, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32>
%3044 = torch_c.to_builtin_tensor %3043 : !torch.vtensor<[1,?,32,128],f32> -> tensor<1x?x32x128xf32>
%3045 = torch_c.to_builtin_tensor %3042 : !torch.vtensor<[1,?,128],f32> -> tensor<1x?x128xf32>
%3046 = util.call @sharktank_rotary_embedding_1_D_32_128_f32(%3044, %3045) : (tensor<1x?x32x128xf32>, tensor<1x?x128xf32>) -> tensor<1x?x32x128xf32>
%3047 = torch_c.from_builtin_tensor %3046 : tensor<1x?x32x128xf32> -> !torch.vtensor<[1,?,32,128],f32>
torch.bind_symbolic_shape %3047, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32>
%int15_3085 = torch.constant.int 15
%3048 = torch.prims.convert_element_type %3047, %int15_3085 : !torch.vtensor<[1,?,32,128],f32>, !torch.int -> !torch.vtensor<[1,?,32,128],bf16>
torch.bind_symbolic_shape %3048, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16>
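// The same frequency-table construction is repeated for the 8-head tensor and applied via sharktank_rotary_embedding_1_D_8_128_f32.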
%int131072_3086 = torch.constant.int 131072
%none_3087 = torch.constant.none
%none_3088 = torch.constant.none
%cpu_3089 = torch.constant.device "cpu"
%false_3090 = torch.constant.bool false
%3049 = torch.aten.arange %int131072_3086, %none_3087, %none_3088, %cpu_3089, %false_3090 : !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[131072],si64>
%int0_3091 = torch.constant.int 0
%int128_3092 = torch.constant.int 128
%none_3093 = torch.constant.none
%none_3094 = torch.constant.none
%cpu_3095 = torch.constant.device "cpu"
%false_3096 = torch.constant.bool false
%3050 = torch.aten.arange.start %int0_3091, %int128_3092, %none_3093, %none_3094, %cpu_3095, %false_3096 : !torch.int, !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[128],si64>
%int2_3097 = torch.constant.int 2
%3051 = torch.aten.floor_divide.Scalar %3050, %int2_3097 : !torch.vtensor<[128],si64>, !torch.int -> !torch.vtensor<[128],si64>
%int6_3098 = torch.constant.int 6
%3052 = torch.prims.convert_element_type %3051, %int6_3098 : !torch.vtensor<[128],si64>, !torch.int -> !torch.vtensor<[128],f32>
%int128_3099 = torch.constant.int 128
%3053 = torch.aten.div.Scalar %3052, %int128_3099 : !torch.vtensor<[128],f32>, !torch.int -> !torch.vtensor<[128],f32>
%float2.000000e00_3100 = torch.constant.float 2.000000e+00
%3054 = torch.aten.mul.Scalar %3053, %float2.000000e00_3100 : !torch.vtensor<[128],f32>, !torch.float -> !torch.vtensor<[128],f32>
%float5.000000e05_3101 = torch.constant.float 5.000000e+05
%3055 = torch.aten.pow.Scalar %float5.000000e05_3101, %3054 : !torch.float, !torch.vtensor<[128],f32> -> !torch.vtensor<[128],f32>
%3056 = torch.aten.reciprocal %3055 : !torch.vtensor<[128],f32> -> !torch.vtensor<[128],f32>
%float1.000000e00_3102 = torch.constant.float 1.000000e+00
%3057 = torch.aten.mul.Scalar %3056, %float1.000000e00_3102 : !torch.vtensor<[128],f32>, !torch.float -> !torch.vtensor<[128],f32>
%int131072_3103 = torch.constant.int 131072
%int1_3104 = torch.constant.int 1
%3058 = torch.prim.ListConstruct %int131072_3103, %int1_3104 : (!torch.int, !torch.int) -> !torch.list<int>
%3059 = torch.aten.view %3049, %3058 : !torch.vtensor<[131072],si64>, !torch.list<int> -> !torch.vtensor<[131072,1],si64>
%3060 = torch.aten.mul.Tensor %3059, %3057 : !torch.vtensor<[131072,1],si64>, !torch.vtensor<[128],f32> -> !torch.vtensor<[131072,128],f32>
%int1_3105 = torch.constant.int 1
%3061 = torch.aten.size.int %3003, %int1_3105 : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>, !torch.int -> !torch.int
%int0_3106 = torch.constant.int 0
%3062 = torch.aten.add.int %int0_3106, %3061 : !torch.int, !torch.int -> !torch.int
%int0_3107 = torch.constant.int 0
%int0_3108 = torch.constant.int 0
%int1_3109 = torch.constant.int 1
%3063 = torch.aten.slice.Tensor %3060, %int0_3107, %int0_3108, %3062, %int1_3109 : !torch.vtensor<[131072,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],f32>
torch.bind_symbolic_shape %3063, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],f32>
%int1_3110 = torch.constant.int 1
%int0_3111 = torch.constant.int 0
%int9223372036854775807_3112 = torch.constant.int 9223372036854775807
%int1_3113 = torch.constant.int 1
%3064 = torch.aten.slice.Tensor %3063, %int1_3110, %int0_3111, %int9223372036854775807_3112, %int1_3113 : !torch.vtensor<[?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],f32>
torch.bind_symbolic_shape %3064, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],f32>
%int1_3114 = torch.constant.int 1
%int0_3115 = torch.constant.int 0
%int9223372036854775807_3116 = torch.constant.int 9223372036854775807
%int1_3117 = torch.constant.int 1
%3065 = torch.aten.slice.Tensor %3064, %int1_3114, %int0_3115, %int9223372036854775807_3116, %int1_3117 : !torch.vtensor<[?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],f32>
torch.bind_symbolic_shape %3065, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],f32>
%int0_3118 = torch.constant.int 0
%3066 = torch.aten.unsqueeze %3065, %int0_3118 : !torch.vtensor<[?,128],f32>, !torch.int -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %3066, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int1_3119 = torch.constant.int 1
%int0_3120 = torch.constant.int 0
%int9223372036854775807_3121 = torch.constant.int 9223372036854775807
%int1_3122 = torch.constant.int 1
%3067 = torch.aten.slice.Tensor %3066, %int1_3119, %int0_3120, %int9223372036854775807_3121, %int1_3122 : !torch.vtensor<[1,?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %3067, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int2_3123 = torch.constant.int 2
%int0_3124 = torch.constant.int 0
%int9223372036854775807_3125 = torch.constant.int 9223372036854775807
%int1_3126 = torch.constant.int 1
%3068 = torch.aten.slice.Tensor %3067, %int2_3123, %int0_3124, %int9223372036854775807_3125, %int1_3126 : !torch.vtensor<[1,?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %3068, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int1_3127 = torch.constant.int 1
%int1_3128 = torch.constant.int 1
%int1_3129 = torch.constant.int 1
%3069 = torch.prim.ListConstruct %int1_3127, %int1_3128, %int1_3129 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%3070 = torch.aten.repeat %3068, %3069 : !torch.vtensor<[1,?,128],f32>, !torch.list<int> -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %3070, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int6_3130 = torch.constant.int 6
%3071 = torch.prims.convert_element_type %3018, %int6_3130 : !torch.vtensor<[1,?,8,128],bf16>, !torch.int -> !torch.vtensor<[1,?,8,128],f32>
torch.bind_symbolic_shape %3071, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],f32>
%3072 = torch_c.to_builtin_tensor %3071 : !torch.vtensor<[1,?,8,128],f32> -> tensor<1x?x8x128xf32>
%3073 = torch_c.to_builtin_tensor %3070 : !torch.vtensor<[1,?,128],f32> -> tensor<1x?x128xf32>
%3074 = util.call @sharktank_rotary_embedding_1_D_8_128_f32(%3072, %3073) : (tensor<1x?x8x128xf32>, tensor<1x?x128xf32>) -> tensor<1x?x8x128xf32>
%3075 = torch_c.from_builtin_tensor %3074 : tensor<1x?x8x128xf32> -> !torch.vtensor<[1,?,8,128],f32>
torch.bind_symbolic_shape %3075, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],f32>
%int15_3131 = torch.constant.int 15
%3076 = torch.prims.convert_element_type %3075, %int15_3131 : !torch.vtensor<[1,?,8,128],f32>, !torch.int -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %3076, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
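// Quantize both 8-head tensors for the cache (the rotated one and the un-rotated %3020): divide by the shared quantizer scale %195, clamp to [-240, 240], and convert to f8E4M3FNUZ.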
%3077 = torch.aten.div.Tensor %3076, %195 : !torch.vtensor<[1,?,8,128],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %3077, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
%float-2.400000e02_3132 = torch.constant.float -2.400000e+02
%float2.400000e02_3133 = torch.constant.float 2.400000e+02
%3078 = torch.aten.clamp %3077, %float-2.400000e02_3132, %float2.400000e02_3133 : !torch.vtensor<[1,?,8,128],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %3078, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
%int26_3134 = torch.constant.int 26
%3079 = torch.prims.convert_element_type %3078, %int26_3134 : !torch.vtensor<[1,?,8,128],bf16>, !torch.int -> !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %3079, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>
%3080 = torch.aten.div.Tensor %3020, %195 : !torch.vtensor<[1,?,8,128],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %3080, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
%float-2.400000e02_3135 = torch.constant.float -2.400000e+02
%float2.400000e02_3136 = torch.constant.float 2.400000e+02
%3081 = torch.aten.clamp %3080, %float-2.400000e02_3135, %float2.400000e02_3136 : !torch.vtensor<[1,?,8,128],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %3081, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
%int26_3137 = torch.constant.int 26
%3082 = torch.prims.convert_element_type %3081, %int26_3137 : !torch.vtensor<[1,?,8,128],bf16>, !torch.int -> !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %3082, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>
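// Cache write: row indices are %arg2 * 64 plus slot offsets 22 and 23; the flat [?, 2097152] f16 cache %2899 is viewed as [?, 32, 2, 32, 8, 128], updated with index_put at those rows, and flattened back.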
%int64_3138 = torch.constant.int 64
%3083 = torch.aten.mul.Scalar %arg2, %int64_3138 : !torch.vtensor<[1,?],si64>, !torch.int -> !torch.vtensor<[1,?],si64>
torch.bind_symbolic_shape %3083, [%548], affine_map<()[s0] -> (1, s0)> : !torch.vtensor<[1,?],si64>
%int22 = torch.constant.int 22
%int1_3139 = torch.constant.int 1
%3084 = torch.aten.add.Scalar %3083, %int22, %int1_3139 : !torch.vtensor<[1,?],si64>, !torch.int, !torch.int -> !torch.vtensor<[1,?],si64>
torch.bind_symbolic_shape %3084, [%548], affine_map<()[s0] -> (1, s0)> : !torch.vtensor<[1,?],si64>
%int1_3140 = torch.constant.int 1
%int32_3141 = torch.constant.int 32
%int8_3142 = torch.constant.int 8
%int128_3143 = torch.constant.int 128
%3085 = torch.prim.ListConstruct %int1_3140, %670, %int32_3141, %int8_3142, %int128_3143 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%3086 = torch.aten.view %3079, %3085 : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %3086, [%548], affine_map<()[s0] -> (1, s0, 32, 8, 128)> : !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>
%int32_3144 = torch.constant.int 32
%int8_3145 = torch.constant.int 8
%int128_3146 = torch.constant.int 128
%3087 = torch.prim.ListConstruct %670, %int32_3144, %int8_3145, %int128_3146 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%3088 = torch.aten.view %3086, %3087 : !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %3088, [%548], affine_map<()[s0] -> (s0, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
%3089 = torch.prim.ListConstruct %670 : (!torch.int) -> !torch.list<int>
%3090 = torch.aten.view %3084, %3089 : !torch.vtensor<[1,?],si64>, !torch.list<int> -> !torch.vtensor<[?],si64>
torch.bind_symbolic_shape %3090, [%548], affine_map<()[s0] -> (s0)> : !torch.vtensor<[?],si64>
%int32_3147 = torch.constant.int 32
%int2_3148 = torch.constant.int 2
%int32_3149 = torch.constant.int 32
%int8_3150 = torch.constant.int 8
%int128_3151 = torch.constant.int 128
%3091 = torch.prim.ListConstruct %661, %int32_3147, %int2_3148, %int32_3149, %int8_3150, %int128_3151 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%3092 = torch.aten.view %2899, %3091 : !torch.vtensor<[?,2097152],f16>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f16>
torch.bind_symbolic_shape %3092, [%549], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f16>
%int32_3152 = torch.constant.int 32
%3093 = torch.aten.mul.int %661, %int32_3152 : !torch.int, !torch.int -> !torch.int
%int2_3153 = torch.constant.int 2
%3094 = torch.aten.mul.int %3093, %int2_3153 : !torch.int, !torch.int -> !torch.int
%int32_3154 = torch.constant.int 32
%int8_3155 = torch.constant.int 8
%int128_3156 = torch.constant.int 128
%3095 = torch.prim.ListConstruct %3094, %int32_3154, %int8_3155, %int128_3156 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%3096 = torch.aten.view %3092, %3095 : !torch.vtensor<[?,32,2,32,8,128],f16>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f16>
torch.bind_symbolic_shape %3096, [%549], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f16>
%3097 = torch.prim.ListConstruct %3090 : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>>
%false_3157 = torch.constant.bool false
%3098 = torch.aten.index_put %3096, %3097, %3088, %false_3157 : !torch.vtensor<[?,32,8,128],f16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.bool -> !torch.vtensor<[?,32,8,128],f16>
torch.bind_symbolic_shape %3098, [%549], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f16>
%int32_3158 = torch.constant.int 32
%int2_3159 = torch.constant.int 2
%int32_3160 = torch.constant.int 32
%int8_3161 = torch.constant.int 8
%int128_3162 = torch.constant.int 128
%3099 = torch.prim.ListConstruct %661, %int32_3158, %int2_3159, %int32_3160, %int8_3161, %int128_3162 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%3100 = torch.aten.view %3098, %3099 : !torch.vtensor<[?,32,8,128],f16>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f16>
torch.bind_symbolic_shape %3100, [%549], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f16>
%int2097152_3163 = torch.constant.int 2097152
%3101 = torch.prim.ListConstruct %661, %int2097152_3163 : (!torch.int, !torch.int) -> !torch.list<int>
%3102 = torch.aten.view %3100, %3101 : !torch.vtensor<[?,32,2,32,8,128],f16>, !torch.list<int> -> !torch.vtensor<[?,2097152],f16>
torch.bind_symbolic_shape %3102, [%549], affine_map<()[s0] -> (s0, 2097152)> : !torch.vtensor<[?,2097152],f16>
%int32_3164 = torch.constant.int 32
%int2_3165 = torch.constant.int 2
%int32_3166 = torch.constant.int 32
%int8_3167 = torch.constant.int 8
%int128_3168 = torch.constant.int 128
%3103 = torch.prim.ListConstruct %661, %int32_3164, %int2_3165, %int32_3166, %int8_3167, %int128_3168 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%3104 = torch.aten.view %3102, %3103 : !torch.vtensor<[?,2097152],f16>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f16>
torch.bind_symbolic_shape %3104, [%549], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f16>
%int32_3169 = torch.constant.int 32
%int8_3170 = torch.constant.int 8
%int128_3171 = torch.constant.int 128
%3105 = torch.prim.ListConstruct %3094, %int32_3169, %int8_3170, %int128_3171 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%3106 = torch.aten.view %3104, %3105 : !torch.vtensor<[?,32,2,32,8,128],f16>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f16>
torch.bind_symbolic_shape %3106, [%549], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f16>
%int1_3172 = torch.constant.int 1
%int32_3173 = torch.constant.int 32
%int8_3174 = torch.constant.int 8
%int128_3175 = torch.constant.int 128
%3107 = torch.prim.ListConstruct %int1_3172, %670, %int32_3173, %int8_3174, %int128_3175 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%3108 = torch.aten.view %3082, %3107 : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %3108, [%548], affine_map<()[s0] -> (1, s0, 32, 8, 128)> : !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>
%int32_3176 = torch.constant.int 32
%int8_3177 = torch.constant.int 8
%int128_3178 = torch.constant.int 128
%3109 = torch.prim.ListConstruct %670, %int32_3176, %int8_3177, %int128_3178 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%3110 = torch.aten.view %3108, %3109 : !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %3110, [%548], affine_map<()[s0] -> (s0, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
%int1_3179 = torch.constant.int 1
%int1_3180 = torch.constant.int 1
%3111 = torch.aten.add.Scalar %3084, %int1_3179, %int1_3180 : !torch.vtensor<[1,?],si64>, !torch.int, !torch.int -> !torch.vtensor<[1,?],si64>
torch.bind_symbolic_shape %3111, [%548], affine_map<()[s0] -> (1, s0)> : !torch.vtensor<[1,?],si64>
%3112 = torch.prim.ListConstruct %670 : (!torch.int) -> !torch.list<int>
%3113 = torch.aten.view %3111, %3112 : !torch.vtensor<[1,?],si64>, !torch.list<int> -> !torch.vtensor<[?],si64>
torch.bind_symbolic_shape %3113, [%548], affine_map<()[s0] -> (s0)> : !torch.vtensor<[?],si64>
%3114 = torch.prim.ListConstruct %3113 : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>>
%false_3181 = torch.constant.bool false
%3115 = torch.aten.index_put %3106, %3114, %3110, %false_3181 : !torch.vtensor<[?,32,8,128],f16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.bool -> !torch.vtensor<[?,32,8,128],f16>
torch.bind_symbolic_shape %3115, [%549], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f16>
%int32_3182 = torch.constant.int 32
%int2_3183 = torch.constant.int 2
%int32_3184 = torch.constant.int 32
%int8_3185 = torch.constant.int 8
%int128_3186 = torch.constant.int 128
%3116 = torch.prim.ListConstruct %661, %int32_3182, %int2_3183, %int32_3184, %int8_3185, %int128_3186 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%3117 = torch.aten.view %3115, %3116 : !torch.vtensor<[?,32,8,128],f16>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f16>
torch.bind_symbolic_shape %3117, [%549], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f16>
%int2097152_3187 = torch.constant.int 2097152
%3118 = torch.prim.ListConstruct %661, %int2097152_3187 : (!torch.int, !torch.int) -> !torch.list<int>
%3119 = torch.aten.view %3117, %3118 : !torch.vtensor<[?,32,2,32,8,128],f16>, !torch.list<int> -> !torch.vtensor<[?,2097152],f16>
torch.bind_symbolic_shape %3119, [%549], affine_map<()[s0] -> (s0, 2097152)> : !torch.vtensor<[?,2097152],f16>
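// Grouped-query expansion: unsqueeze each 8-head f8 tensor to [1, seq, 8, 1, 128], expand by a factor of 4, and view as [1, seq, 32, 128] so the KV heads match the 32 query heads.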
%int-2_3188 = torch.constant.int -2
%3120 = torch.aten.unsqueeze %3079, %int-2_3188 : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %3120, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 1, 128)> : !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>
%int1_3189 = torch.constant.int 1
%int8_3190 = torch.constant.int 8
%int4_3191 = torch.constant.int 4
%int128_3192 = torch.constant.int 128
%3121 = torch.prim.ListConstruct %int1_3189, %3061, %int8_3190, %int4_3191, %int128_3192 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%false_3193 = torch.constant.bool false
%3122 = torch.aten.expand %3120, %3121, %false_3193 : !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %3122, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 4, 128)> : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
%int0_3194 = torch.constant.int 0
%3123 = torch.aten.clone %3122, %int0_3194 : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %3123, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 4, 128)> : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
%int1_3195 = torch.constant.int 1
%int32_3196 = torch.constant.int 32
%int128_3197 = torch.constant.int 128
%3124 = torch.prim.ListConstruct %int1_3195, %3061, %int32_3196, %int128_3197 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%3125 = torch.aten._unsafe_view %3123, %3124 : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %3125, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>
%int-2_3198 = torch.constant.int -2
%3126 = torch.aten.unsqueeze %3082, %int-2_3198 : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %3126, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 1, 128)> : !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>
%int1_3199 = torch.constant.int 1
%3127 = torch.aten.size.int %3013, %int1_3199 : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>, !torch.int -> !torch.int
%int1_3200 = torch.constant.int 1
%int8_3201 = torch.constant.int 8
%int4_3202 = torch.constant.int 4
%int128_3203 = torch.constant.int 128
%3128 = torch.prim.ListConstruct %int1_3200, %3127, %int8_3201, %int4_3202, %int128_3203 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%false_3204 = torch.constant.bool false
%3129 = torch.aten.expand %3126, %3128, %false_3204 : !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %3129, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 4, 128)> : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
%int0_3205 = torch.constant.int 0
%3130 = torch.aten.clone %3129, %int0_3205 : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %3130, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 4, 128)> : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
%int1_3206 = torch.constant.int 1
%int32_3207 = torch.constant.int 32
%int128_3208 = torch.constant.int 128
%3131 = torch.prim.ListConstruct %int1_3206, %3127, %int32_3207, %int128_3208 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%3132 = torch.aten._unsafe_view %3130, %3131 : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %3132, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>
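// Dequantize the expanded tensors for attention: convert to f32, multiply by the quantizer scale %195, and cast to bf16.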
%int6_3209 = torch.constant.int 6
%3133 = torch.prims.convert_element_type %3125, %int6_3209 : !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,32,128],f32>
torch.bind_symbolic_shape %3133, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32>
%3134 = torch.aten.mul.Tensor %3133, %195 : !torch.vtensor<[1,?,32,128],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,32,128],f32>
torch.bind_symbolic_shape %3134, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32>
%int15_3210 = torch.constant.int 15
%3135 = torch.prims.convert_element_type %3134, %int15_3210 : !torch.vtensor<[1,?,32,128],f32>, !torch.int -> !torch.vtensor<[1,?,32,128],bf16>
torch.bind_symbolic_shape %3135, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16>
%int6_3211 = torch.constant.int 6
%3136 = torch.prims.convert_element_type %3132, %int6_3211 : !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,32,128],f32>
torch.bind_symbolic_shape %3136, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32>
%3137 = torch.aten.mul.Tensor %3136, %195 : !torch.vtensor<[1,?,32,128],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,32,128],f32>